From 0a13c00e9d4502b8e3fd9260ce781758ff2c3970 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:44 -0700
Subject: workqueue: reorder queueing functions so that _on() variants are on
 top

Currently, queue/schedule[_delayed]_work_on() are located below the
counterpart without the _on postfix even though the latter is usually
implemented using the former.  Swap them.

This is cleanup and doesn't cause any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 124 ++++++++++++++++++++++++++---------------------------
 1 file changed, 62 insertions(+), 62 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 692d97628a10..07d309e7e359 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1052,27 +1052,6 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	spin_unlock_irqrestore(&gcwq->lock, flags);
 }
 
-/**
- * queue_work - queue work on a workqueue
- * @wq: workqueue to use
- * @work: work to queue
- *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
- *
- * We queue the work to the CPU on which it was submitted, but if the CPU dies
- * it can be processed by another CPU.
- */
-int queue_work(struct workqueue_struct *wq, struct work_struct *work)
-{
-	int ret;
-
-	ret = queue_work_on(get_cpu(), wq, work);
-	put_cpu();
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(queue_work);
-
 /**
  * queue_work_on - queue work on specific cpu
  * @cpu: CPU number to execute work on
@@ -1097,31 +1076,34 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
 }
 EXPORT_SYMBOL_GPL(queue_work_on);
 
-static void delayed_work_timer_fn(unsigned long __data)
-{
-	struct delayed_work *dwork = (struct delayed_work *)__data;
-	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
-
-	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
-}
-
 /**
- * queue_delayed_work - queue work on a workqueue after delay
+ * queue_work - queue work on a workqueue
  * @wq: workqueue to use
- * @dwork: delayable work to queue
- * @delay: number of jiffies to wait before queueing
+ * @work: work to queue
  *
  * Returns 0 if @work was already on a queue, non-zero otherwise.
+ *
+ * We queue the work to the CPU on which it was submitted, but if the CPU dies
+ * it can be processed by another CPU.
  */
-int queue_delayed_work(struct workqueue_struct *wq,
-			struct delayed_work *dwork, unsigned long delay)
+int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
-	if (delay == 0)
-		return queue_work(wq, &dwork->work);
+	int ret;
 
-	return queue_delayed_work_on(-1, wq, dwork, delay);
+	ret = queue_work_on(get_cpu(), wq, work);
+	put_cpu();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work);
+
+static void delayed_work_timer_fn(unsigned long __data)
+{
+	struct delayed_work *dwork = (struct delayed_work *)__data;
+	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
+
+	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
 }
-EXPORT_SYMBOL_GPL(queue_delayed_work);
 
 /**
  * queue_delayed_work_on - queue work on specific CPU after delay
@@ -1178,6 +1160,24 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
+/**
+ * queue_delayed_work - queue work on a workqueue after delay
+ * @wq: workqueue to use
+ * @dwork: delayable work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
+ */
+int queue_delayed_work(struct workqueue_struct *wq,
+			struct delayed_work *dwork, unsigned long delay)
+{
+	if (delay == 0)
+		return queue_work(wq, &dwork->work);
+
+	return queue_delayed_work_on(-1, wq, dwork, delay);
+}
+EXPORT_SYMBOL_GPL(queue_delayed_work);
+
 /**
  * worker_enter_idle - enter idle state
  * @worker: worker which is entering idle state
@@ -2877,6 +2877,19 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
 }
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
+/*
+ * schedule_work_on - put work task on a specific cpu
+ * @cpu: cpu to put the work task on
+ * @work: job to be done
+ *
+ * This puts a job on a specific cpu
+ */
+int schedule_work_on(int cpu, struct work_struct *work)
+{
+	return queue_work_on(cpu, system_wq, work);
+}
+EXPORT_SYMBOL(schedule_work_on);
+
 /**
  * schedule_work - put work task in global workqueue
  * @work: job to be done
@@ -2894,18 +2907,21 @@ int schedule_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(schedule_work);
 
-/*
- * schedule_work_on - put work task on a specific cpu
- * @cpu: cpu to put the work task on
- * @work: job to be done
+/**
+ * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
+ * @cpu: cpu to use
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait
  *
- * This puts a job on a specific cpu
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue on the specified CPU.
  */
-int schedule_work_on(int cpu, struct work_struct *work)
+int schedule_delayed_work_on(int cpu,
+			struct delayed_work *dwork, unsigned long delay)
 {
-	return queue_work_on(cpu, system_wq, work);
+	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
 }
-EXPORT_SYMBOL(schedule_work_on);
+EXPORT_SYMBOL(schedule_delayed_work_on);
 
 /**
  * schedule_delayed_work - put work task in global workqueue after delay
@@ -2922,22 +2938,6 @@ int schedule_delayed_work(struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(schedule_delayed_work);
 
-/**
- * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
- * @cpu: cpu to use
- * @dwork: job to be done
- * @delay: number of jiffies to wait
- *
- * After waiting for a given time this puts a job in the kernel-global
- * workqueue on the specified CPU.
- */
-int schedule_delayed_work_on(int cpu,
-			struct delayed_work *dwork, unsigned long delay)
-{
-	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
-}
-EXPORT_SYMBOL(schedule_delayed_work_on);
-
 /**
  * schedule_on_each_cpu - execute a function synchronously on each online CPU
  * @func: the function to call
-- 
cgit v1.2.3


From d4283e9378619c14dc3826a6b0527eb5d967ffde Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:44 -0700
Subject: workqueue: make queueing functions return bool

All queueing functions return 1 on success, 0 if the work item was
already pending.  Update them to return bool instead.  This signifies
better that they don't return 0 / -errno.

This is cleanup and doesn't cause any functional difference.

While at it, fix comment opening for schedule_work_on().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 47 +++++++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 07d309e7e359..70f95ab28f3d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1058,19 +1058,19 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
  * @wq: workqueue to use
  * @work: work to queue
  *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise.
  *
  * We queue the work to a specific CPU, the caller must ensure it
  * can't go away.
  */
-int
-queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
+bool queue_work_on(int cpu, struct workqueue_struct *wq,
+		   struct work_struct *work)
 {
-	int ret = 0;
+	bool ret = false;
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		__queue_work(cpu, wq, work);
-		ret = 1;
+		ret = true;
 	}
 	return ret;
 }
@@ -1081,14 +1081,14 @@ EXPORT_SYMBOL_GPL(queue_work_on);
  * @wq: workqueue to use
  * @work: work to queue
  *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise.
  *
  * We queue the work to the CPU on which it was submitted, but if the CPU dies
  * it can be processed by another CPU.
  */
-int queue_work(struct workqueue_struct *wq, struct work_struct *work)
+bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
-	int ret;
+	bool ret;
 
 	ret = queue_work_on(get_cpu(), wq, work);
 	put_cpu();
@@ -1112,14 +1112,14 @@ static void delayed_work_timer_fn(unsigned long __data)
  * @dwork: work to queue
  * @delay: number of jiffies to wait before queueing
  *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise.
  */
-int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
-			struct delayed_work *dwork, unsigned long delay)
+bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+			   struct delayed_work *dwork, unsigned long delay)
 {
-	int ret = 0;
 	struct timer_list *timer = &dwork->timer;
 	struct work_struct *work = &dwork->work;
+	bool ret = false;
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		unsigned int lcpu;
@@ -1154,7 +1154,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			add_timer_on(timer, cpu);
 		else
 			add_timer(timer);
-		ret = 1;
+		ret = true;
 	}
 	return ret;
 }
@@ -1166,9 +1166,9 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
  * @dwork: delayable work to queue
  * @delay: number of jiffies to wait before queueing
  *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise.
  */
-int queue_delayed_work(struct workqueue_struct *wq,
+bool queue_delayed_work(struct workqueue_struct *wq,
 			struct delayed_work *dwork, unsigned long delay)
 {
 	if (delay == 0)
@@ -2877,14 +2877,14 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
 }
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
-/*
+/**
  * schedule_work_on - put work task on a specific cpu
  * @cpu: cpu to put the work task on
  * @work: job to be done
  *
  * This puts a job on a specific cpu
  */
-int schedule_work_on(int cpu, struct work_struct *work)
+bool schedule_work_on(int cpu, struct work_struct *work)
 {
 	return queue_work_on(cpu, system_wq, work);
 }
@@ -2894,14 +2894,14 @@ EXPORT_SYMBOL(schedule_work_on);
  * schedule_work - put work task in global workqueue
  * @work: job to be done
  *
- * Returns zero if @work was already on the kernel-global workqueue and
- * non-zero otherwise.
+ * Returns %false if @work was already on the kernel-global workqueue and
+ * %true otherwise.
  *
  * This puts a job in the kernel-global workqueue if it was not already
  * queued and leaves it in the same position on the kernel-global
  * workqueue otherwise.
  */
-int schedule_work(struct work_struct *work)
+bool schedule_work(struct work_struct *work)
 {
 	return queue_work(system_wq, work);
 }
@@ -2916,8 +2916,8 @@ EXPORT_SYMBOL(schedule_work);
  * After waiting for a given time this puts a job in the kernel-global
  * workqueue on the specified CPU.
  */
-int schedule_delayed_work_on(int cpu,
-			struct delayed_work *dwork, unsigned long delay)
+bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
+			      unsigned long delay)
 {
 	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
 }
@@ -2931,8 +2931,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
  * After waiting for a given time this puts a job in the kernel-global
  * workqueue.
  */
-int schedule_delayed_work(struct delayed_work *dwork,
-					unsigned long delay)
+bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
 {
 	return queue_delayed_work(system_wq, dwork, delay);
 }
-- 
cgit v1.2.3


From 959d1af8cffc8fd38ed53e8be1cf4ab8782f9c00 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:45 -0700
Subject: workqueue: add missing smp_wmb() in process_one_work()

WORK_STRUCT_PENDING is used to claim ownership of a work item and
process_one_work() releases it before starting execution.  When
someone else grabs PENDING, all pre-release updates to the work item
should be visible and all updates made by the new owner should happen
afterwards.

Grabbing PENDING uses test_and_set_bit() and thus has a full barrier;
however, clearing doesn't have a matching wmb.  Given the preceding
spin_unlock and use of clear_bit, I don't believe this can be a
problem on an actual machine and there hasn't been any related report
but it still is theoretically possible for clear_pending to permeate
upwards and happen before work->entry update.

Add an explicit smp_wmb() before work_clear_pending().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: stable@vger.kernel.org
---
 kernel/workqueue.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 70f95ab28f3d..5c26d36146b7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1997,7 +1997,9 @@ __acquires(&gcwq->lock)
 
 	spin_unlock_irq(&gcwq->lock);
 
+	smp_wmb();	/* paired with test_and_set_bit(PENDING) */
 	work_clear_pending(work);
+
 	lock_map_acquire_read(&cwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
 	trace_workqueue_execute_start(work);
-- 
cgit v1.2.3


From 8930caba3dbdd8b86dd6934a5920bf61b53a931e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:45 -0700
Subject: workqueue: disable irq while manipulating PENDING

Queueing operations use WORK_STRUCT_PENDING_BIT to synchronize access
to the target work item.  They first try to claim the bit and proceed
with queueing only after that succeeds and there's a window between
PENDING being set and the actual queueing where the task can be
interrupted or preempted.

There's also a similar window in process_one_work() when clearing
PENDING.  A work item is dequeued, gcwq->lock is released and then
PENDING is cleared and the worker might get interrupted or preempted
between releasing gcwq->lock and clearing PENDING.

cancel[_delayed]_work_sync() tries to claim or steal PENDING.  The
function assumes that a work item with PENDING is either queued or in
the process of being [de]queued.  In the latter case, it busy-loops
until either the work item loses PENDING or is queued.  If canceling
coincides with the above described interrupts or preemptions, the
canceling task will busy-loop while the queueing or executing task is
preempted.

This patch keeps irq disabled across claiming PENDING and actual
queueing and moves PENDING clearing in process_one_work() inside
gcwq->lock so that busy looping from PENDING && !queued doesn't wait
for interrupted/preempted tasks.  Note that, in process_one_work(),
setting last CPU and clearing PENDING got merged into single
operation.

This removes possible long busy-loops and will allow using
try_to_grab_pending() from bh and irq contexts.

v2: __queue_work() was testing preempt_count() to ensure that the
    caller has disabled preemption.  This triggers spuriously if
    !CONFIG_PREEMPT_COUNT.  Use preemptible() instead.  Reported by
    Fengguang Wu.

v3: Disable irq instead of preemption.  IRQ will be disabled while
    grabbing gcwq->lock later anyway and this allows using
    try_to_grab_pending() from bh and irq contexts.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
---
 kernel/workqueue.c | 73 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 53 insertions(+), 20 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5c26d36146b7..30474c4e107c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -537,9 +537,10 @@ static int work_next_color(int color)
  * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
  * cleared and the work data contains the cpu number it was last on.
  *
- * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
- * cwq, cpu or clear work->data.  These functions should only be
- * called while the work is owned - ie. while the PENDING bit is set.
+ * set_work_cwq(), set_work_cpu_and_clear_pending() and clear_work_data()
+ * can be used to set the cwq, cpu or clear work->data.  These functions
+ * should only be called while the work is owned - ie. while the PENDING
+ * bit is set.
  *
  * get_work_[g]cwq() can be used to obtain the gcwq or cwq
  * corresponding to a work.  gcwq is available once the work has been
@@ -561,9 +562,10 @@ static void set_work_cwq(struct work_struct *work,
 		      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
 }
 
-static void set_work_cpu(struct work_struct *work, unsigned int cpu)
+static void set_work_cpu_and_clear_pending(struct work_struct *work,
+					   unsigned int cpu)
 {
-	set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
+	set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, 0);
 }
 
 static void clear_work_data(struct work_struct *work)
@@ -981,7 +983,14 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	struct cpu_workqueue_struct *cwq;
 	struct list_head *worklist;
 	unsigned int work_flags;
-	unsigned long flags;
+
+	/*
+	 * While a work item is PENDING && off queue, a task trying to
+	 * steal the PENDING will busy-loop waiting for it to either get
+	 * queued or lose PENDING.  Grabbing PENDING and queueing should
+	 * happen with IRQ disabled.
+	 */
+	WARN_ON_ONCE(!irqs_disabled());
 
 	debug_work_activate(work);
 
@@ -1008,7 +1017,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 		    (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
 			struct worker *worker;
 
-			spin_lock_irqsave(&last_gcwq->lock, flags);
+			spin_lock(&last_gcwq->lock);
 
 			worker = find_worker_executing_work(last_gcwq, work);
 
@@ -1016,14 +1025,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 				gcwq = last_gcwq;
 			else {
 				/* meh... not running there, queue here */
-				spin_unlock_irqrestore(&last_gcwq->lock, flags);
-				spin_lock_irqsave(&gcwq->lock, flags);
+				spin_unlock(&last_gcwq->lock);
+				spin_lock(&gcwq->lock);
 			}
-		} else
-			spin_lock_irqsave(&gcwq->lock, flags);
+		} else {
+			spin_lock(&gcwq->lock);
+		}
 	} else {
 		gcwq = get_gcwq(WORK_CPU_UNBOUND);
-		spin_lock_irqsave(&gcwq->lock, flags);
+		spin_lock(&gcwq->lock);
 	}
 
 	/* gcwq determined, get cwq and queue */
@@ -1031,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	trace_workqueue_queue_work(cpu, cwq, work);
 
 	if (WARN_ON(!list_empty(&work->entry))) {
-		spin_unlock_irqrestore(&gcwq->lock, flags);
+		spin_unlock(&gcwq->lock);
 		return;
 	}
 
@@ -1049,7 +1059,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 
 	insert_work(cwq, work, worklist, work_flags);
 
-	spin_unlock_irqrestore(&gcwq->lock, flags);
+	spin_unlock(&gcwq->lock);
 }
 
 /**
@@ -1067,11 +1077,16 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
 		   struct work_struct *work)
 {
 	bool ret = false;
+	unsigned long flags;
+
+	local_irq_save(flags);
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		__queue_work(cpu, wq, work);
 		ret = true;
 	}
+
+	local_irq_restore(flags);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(queue_work_on);
@@ -1102,7 +1117,9 @@ static void delayed_work_timer_fn(unsigned long __data)
 	struct delayed_work *dwork = (struct delayed_work *)__data;
 	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
 
+	local_irq_disable();
 	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
+	local_irq_enable();
 }
 
 /**
@@ -1120,6 +1137,10 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	struct timer_list *timer = &dwork->timer;
 	struct work_struct *work = &dwork->work;
 	bool ret = false;
+	unsigned long flags;
+
+	/* read the comment in __queue_work() */
+	local_irq_save(flags);
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		unsigned int lcpu;
@@ -1156,6 +1177,8 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			add_timer(timer);
 		ret = true;
 	}
+
+	local_irq_restore(flags);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
@@ -1970,15 +1993,13 @@ __acquires(&gcwq->lock)
 		return;
 	}
 
-	/* claim and process */
+	/* claim and dequeue */
 	debug_work_deactivate(work);
 	hlist_add_head(&worker->hentry, bwh);
 	worker->current_work = work;
 	worker->current_cwq = cwq;
 	work_color = get_work_color(work);
 
-	/* record the current cpu number in the work data and dequeue */
-	set_work_cpu(work, gcwq->cpu);
 	list_del_init(&work->entry);
 
 	/*
@@ -1995,10 +2016,18 @@ __acquires(&gcwq->lock)
 	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
 		wake_up_worker(pool);
 
-	spin_unlock_irq(&gcwq->lock);
+	/*
+	 * Record the last CPU and clear PENDING.  The following wmb is
+	 * paired with the implied mb in test_and_set_bit(PENDING) and
+	 * ensures all updates to @work made here are visible to and
+	 * precede any updates by the next PENDING owner.  Also, clear
+	 * PENDING inside @gcwq->lock so that PENDING and queued state
+	 * changes happen together while IRQ is disabled.
+	 */
+	smp_wmb();
+	set_work_cpu_and_clear_pending(work, gcwq->cpu);
 
-	smp_wmb();	/* paired with test_and_set_bit(PENDING) */
-	work_clear_pending(work);
+	spin_unlock_irq(&gcwq->lock);
 
 	lock_map_acquire_read(&cwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
@@ -2836,9 +2865,11 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
  */
 bool flush_delayed_work(struct delayed_work *dwork)
 {
+	local_irq_disable();
 	if (del_timer_sync(&dwork->timer))
 		__queue_work(raw_smp_processor_id(),
 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
+	local_irq_enable();
 	return flush_work(&dwork->work);
 }
 EXPORT_SYMBOL(flush_delayed_work);
@@ -2857,9 +2888,11 @@ EXPORT_SYMBOL(flush_delayed_work);
  */
 bool flush_delayed_work_sync(struct delayed_work *dwork)
 {
+	local_irq_disable();
 	if (del_timer_sync(&dwork->timer))
 		__queue_work(raw_smp_processor_id(),
 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
+	local_irq_enable();
 	return flush_work_sync(&dwork->work);
 }
 EXPORT_SYMBOL(flush_delayed_work_sync);
-- 
cgit v1.2.3


From d8e794dfd51c368ed3f686b7f4172830b60ae47b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:45 -0700
Subject: workqueue: set delayed_work->timer function on initialization

delayed_work->timer.function is currently initialized during
queue_delayed_work_on().  Export delayed_work_timer_fn() and set
delayed_work timer function during delayed_work initialization
together with other fields.

This ensures the timer function is always valid on an initialized
delayed_work.  This is to help mod_delayed_work() implementation.

To detect delayed_work users which diddle with the internal timer,
trigger WARN if timer function doesn't match on queue.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 30474c4e107c..55392385fe30 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1112,7 +1112,7 @@ bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
 }
 EXPORT_SYMBOL_GPL(queue_work);
 
-static void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(unsigned long __data)
 {
 	struct delayed_work *dwork = (struct delayed_work *)__data;
 	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
@@ -1121,6 +1121,7 @@ static void delayed_work_timer_fn(unsigned long __data)
 	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
 	local_irq_enable();
 }
+EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
 
 /**
  * queue_delayed_work_on - queue work on specific CPU after delay
@@ -1145,6 +1146,8 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		unsigned int lcpu;
 
+		WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
+			     timer->data != (unsigned long)dwork);
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
 
@@ -1168,8 +1171,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 		set_work_cwq(work, get_cwq(lcpu, wq), 0);
 
 		timer->expires = jiffies + delay;
-		timer->data = (unsigned long)dwork;
-		timer->function = delayed_work_timer_fn;
 
 		if (unlikely(cpu >= 0))
 			add_timer_on(timer, cpu);
-- 
cgit v1.2.3


From 57469821fd5c61f25f783827d7334063cff67d65 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:45 -0700
Subject: workqueue: unify local CPU queueing handling

Queueing functions have been using different methods to determine the
local CPU.

* queue_work() superfluously uses get/put_cpu() to acquire and hold the
  local CPU across queue_work_on().

* delayed_work_timer_fn() uses smp_processor_id().

* queue_delayed_work() calls queue_delayed_work_on() with -1 @cpu
  which is interpreted as the local CPU.

* flush_delayed_work[_sync]() were using raw_smp_processor_id().

* __queue_work() interprets %WORK_CPU_UNBOUND as local CPU if the
  target workqueue is bound one but nobody uses this.

This patch converts all functions to uniformly use %WORK_CPU_UNBOUND
to indicate local CPU and use the local binding feature of
__queue_work().  unlikely() is dropped from %WORK_CPU_UNBOUND handling
in __queue_work().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 55392385fe30..ce60bb5d12fb 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1003,7 +1003,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	if (!(wq->flags & WQ_UNBOUND)) {
 		struct global_cwq *last_gcwq;
 
-		if (unlikely(cpu == WORK_CPU_UNBOUND))
+		if (cpu == WORK_CPU_UNBOUND)
 			cpu = raw_smp_processor_id();
 
 		/*
@@ -1103,12 +1103,7 @@ EXPORT_SYMBOL_GPL(queue_work_on);
  */
 bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
-	bool ret;
-
-	ret = queue_work_on(get_cpu(), wq, work);
-	put_cpu();
-
-	return ret;
+	return queue_work_on(WORK_CPU_UNBOUND, wq, work);
 }
 EXPORT_SYMBOL_GPL(queue_work);
 
@@ -1118,7 +1113,7 @@ void delayed_work_timer_fn(unsigned long __data)
 	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
 
 	local_irq_disable();
-	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
+	__queue_work(WORK_CPU_UNBOUND, cwq->wq, &dwork->work);
 	local_irq_enable();
 }
 EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
@@ -1172,7 +1167,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 
 		timer->expires = jiffies + delay;
 
-		if (unlikely(cpu >= 0))
+		if (unlikely(cpu != WORK_CPU_UNBOUND))
 			add_timer_on(timer, cpu);
 		else
 			add_timer(timer);
@@ -1198,7 +1193,7 @@ bool queue_delayed_work(struct workqueue_struct *wq,
 	if (delay == 0)
 		return queue_work(wq, &dwork->work);
 
-	return queue_delayed_work_on(-1, wq, dwork, delay);
+	return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work);
 
@@ -2868,7 +2863,7 @@ bool flush_delayed_work(struct delayed_work *dwork)
 {
 	local_irq_disable();
 	if (del_timer_sync(&dwork->timer))
-		__queue_work(raw_smp_processor_id(),
+		__queue_work(WORK_CPU_UNBOUND,
 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
 	local_irq_enable();
 	return flush_work(&dwork->work);
@@ -2891,7 +2886,7 @@ bool flush_delayed_work_sync(struct delayed_work *dwork)
 {
 	local_irq_disable();
 	if (del_timer_sync(&dwork->timer))
-		__queue_work(raw_smp_processor_id(),
+		__queue_work(WORK_CPU_UNBOUND,
 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
 	local_irq_enable();
 	return flush_work_sync(&dwork->work);
-- 
cgit v1.2.3


From 715f1300802e6eaefa85f6cfc70ae99af3d5d497 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:46 -0700
Subject: workqueue: fix zero @delay handling of queue_delayed_work_on()

If @delay is zero and the delayed_work is idle, queue_delayed_work()
queues it for immediate execution; however, queue_delayed_work_on()
lacks this logic and always goes through timer regardless of @delay.

This patch moves 0 @delay handling logic from queue_delayed_work() to
queue_delayed_work_on() so that both functions behave the same.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ce60bb5d12fb..6cbdc22f8ec7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1125,7 +1125,9 @@ EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
  * @dwork: work to queue
  * @delay: number of jiffies to wait before queueing
  *
- * Returns %false if @work was already on a queue, %true otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise.  If
+ * @delay is zero and @dwork is idle, it will be scheduled for immediate
+ * execution.
  */
 bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			   struct delayed_work *dwork, unsigned long delay)
@@ -1135,6 +1137,9 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	bool ret = false;
 	unsigned long flags;
 
+	if (!delay)
+		return queue_work_on(cpu, wq, &dwork->work);
+
 	/* read the comment in __queue_work() */
 	local_irq_save(flags);
 
@@ -1185,14 +1190,11 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
  * @dwork: delayable work to queue
  * @delay: number of jiffies to wait before queueing
  *
- * Returns %false if @work was already on a queue, %true otherwise.
+ * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
  */
 bool queue_delayed_work(struct workqueue_struct *wq,
 			struct delayed_work *dwork, unsigned long delay)
 {
-	if (delay == 0)
-		return queue_work(wq, &dwork->work);
-
 	return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work);
-- 
cgit v1.2.3


From bf4ede014ea886b71ef71368738da35b316cb7c0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:46 -0700
Subject: workqueue: move try_to_grab_pending() upwards

try_to_grab_pending() will be used by to-be-implemented
mod_delayed_work[_on]().  Move try_to_grab_pending() and related
functions above queueing functions.

This patch only moves functions around.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 286 ++++++++++++++++++++++++++---------------------------
 1 file changed, 143 insertions(+), 143 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6cbdc22f8ec7..0f50f4078e36 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -903,6 +903,149 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 					    work);
 }
 
+/**
+ * move_linked_works - move linked works to a list
+ * @work: start of series of works to be scheduled
+ * @head: target list to append @work to
+ * @nextp: out paramter for nested worklist walking
+ *
+ * Schedule linked works starting from @work to @head.  Work series to
+ * be scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor.
+ *
+ * If @nextp is not NULL, it's updated to point to the next work of
+ * the last scheduled work.  This allows move_linked_works() to be
+ * nested inside outer list_for_each_entry_safe().
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+			      struct work_struct **nextp)
+{
+	struct work_struct *n;
+
+	/*
+	 * Linked worklist will always end before the end of the list,
+	 * use NULL for list head.
+	 */
+	list_for_each_entry_safe_from(work, n, NULL, entry) {
+		list_move_tail(&work->entry, head);
+		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
+			break;
+	}
+
+	/*
+	 * If we're already inside safe list traversal and have moved
+	 * multiple works to the scheduled queue, the next position
+	 * needs to be updated.
+	 */
+	if (nextp)
+		*nextp = n;
+}
+
+static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+{
+	struct work_struct *work = list_first_entry(&cwq->delayed_works,
+						    struct work_struct, entry);
+
+	trace_workqueue_activate_work(work);
+	move_linked_works(work, &cwq->pool->worklist, NULL);
+	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
+	cwq->nr_active++;
+}
+
+/**
+ * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
+ * @cwq: cwq of interest
+ * @color: color of work which left the queue
+ * @delayed: for a delayed work
+ *
+ * A work either has completed or is removed from pending queue,
+ * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
+				 bool delayed)
+{
+	/* ignore uncolored works */
+	if (color == WORK_NO_COLOR)
+		return;
+
+	cwq->nr_in_flight[color]--;
+
+	if (!delayed) {
+		cwq->nr_active--;
+		if (!list_empty(&cwq->delayed_works)) {
+			/* one down, submit a delayed one */
+			if (cwq->nr_active < cwq->max_active)
+				cwq_activate_first_delayed(cwq);
+		}
+	}
+
+	/* is flush in progress and are we at the flushing tip? */
+	if (likely(cwq->flush_color != color))
+		return;
+
+	/* are there still in-flight works? */
+	if (cwq->nr_in_flight[color])
+		return;
+
+	/* this cwq is done, clear flush_color */
+	cwq->flush_color = -1;
+
+	/*
+	 * If this was the last cwq, wake up the first flusher.  It
+	 * will handle the rest.
+	 */
+	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
+		complete(&cwq->wq->first_flusher->done);
+}
+
+/*
+ * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
+ * so this work can't be re-armed in any way.
+ */
+static int try_to_grab_pending(struct work_struct *work)
+{
+	struct global_cwq *gcwq;
+	int ret = -1;
+
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
+		return 0;
+
+	/*
+	 * The queueing is in progress, or it is already queued. Try to
+	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+	 */
+	gcwq = get_work_gcwq(work);
+	if (!gcwq)
+		return ret;
+
+	spin_lock_irq(&gcwq->lock);
+	if (!list_empty(&work->entry)) {
+		/*
+		 * This work is queued, but perhaps we locked the wrong gcwq.
+		 * In that case we must see the new value after rmb(), see
+		 * insert_work()->wmb().
+		 */
+		smp_rmb();
+		if (gcwq == get_work_gcwq(work)) {
+			debug_work_deactivate(work);
+			list_del_init(&work->entry);
+			cwq_dec_nr_in_flight(get_work_cwq(work),
+				get_work_color(work),
+				*work_data_bits(work) & WORK_STRUCT_DELAYED);
+			ret = 1;
+		}
+	}
+	spin_unlock_irq(&gcwq->lock);
+
+	return ret;
+}
+
 /**
  * insert_work - insert a work into gcwq
  * @cwq: cwq @work belongs to
@@ -1831,107 +1974,6 @@ static bool manage_workers(struct worker *worker)
 	return ret;
 }
 
-/**
- * move_linked_works - move linked works to a list
- * @work: start of series of works to be scheduled
- * @head: target list to append @work to
- * @nextp: out paramter for nested worklist walking
- *
- * Schedule linked works starting from @work to @head.  Work series to
- * be scheduled starts at @work and includes any consecutive work with
- * WORK_STRUCT_LINKED set in its predecessor.
- *
- * If @nextp is not NULL, it's updated to point to the next work of
- * the last scheduled work.  This allows move_linked_works() to be
- * nested inside outer list_for_each_entry_safe().
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- */
-static void move_linked_works(struct work_struct *work, struct list_head *head,
-			      struct work_struct **nextp)
-{
-	struct work_struct *n;
-
-	/*
-	 * Linked worklist will always end before the end of the list,
-	 * use NULL for list head.
-	 */
-	list_for_each_entry_safe_from(work, n, NULL, entry) {
-		list_move_tail(&work->entry, head);
-		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
-			break;
-	}
-
-	/*
-	 * If we're already inside safe list traversal and have moved
-	 * multiple works to the scheduled queue, the next position
-	 * needs to be updated.
-	 */
-	if (nextp)
-		*nextp = n;
-}
-
-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
-{
-	struct work_struct *work = list_first_entry(&cwq->delayed_works,
-						    struct work_struct, entry);
-
-	trace_workqueue_activate_work(work);
-	move_linked_works(work, &cwq->pool->worklist, NULL);
-	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
-	cwq->nr_active++;
-}
-
-/**
- * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
- * @cwq: cwq of interest
- * @color: color of work which left the queue
- * @delayed: for a delayed work
- *
- * A work either has completed or is removed from pending queue,
- * decrement nr_in_flight of its cwq and handle workqueue flushing.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- */
-static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
-				 bool delayed)
-{
-	/* ignore uncolored works */
-	if (color == WORK_NO_COLOR)
-		return;
-
-	cwq->nr_in_flight[color]--;
-
-	if (!delayed) {
-		cwq->nr_active--;
-		if (!list_empty(&cwq->delayed_works)) {
-			/* one down, submit a delayed one */
-			if (cwq->nr_active < cwq->max_active)
-				cwq_activate_first_delayed(cwq);
-		}
-	}
-
-	/* is flush in progress and are we at the flushing tip? */
-	if (likely(cwq->flush_color != color))
-		return;
-
-	/* are there still in-flight works? */
-	if (cwq->nr_in_flight[color])
-		return;
-
-	/* this cwq is done, clear flush_color */
-	cwq->flush_color = -1;
-
-	/*
-	 * If this was the last cwq, wake up the first flusher.  It
-	 * will handle the rest.
-	 */
-	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
-		complete(&cwq->wq->first_flusher->done);
-}
-
 /**
  * process_one_work - process single work
  * @worker: self
@@ -2767,48 +2809,6 @@ bool flush_work_sync(struct work_struct *work)
 }
 EXPORT_SYMBOL_GPL(flush_work_sync);
 
-/*
- * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
- * so this work can't be re-armed in any way.
- */
-static int try_to_grab_pending(struct work_struct *work)
-{
-	struct global_cwq *gcwq;
-	int ret = -1;
-
-	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
-		return 0;
-
-	/*
-	 * The queueing is in progress, or it is already queued. Try to
-	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
-	 */
-	gcwq = get_work_gcwq(work);
-	if (!gcwq)
-		return ret;
-
-	spin_lock_irq(&gcwq->lock);
-	if (!list_empty(&work->entry)) {
-		/*
-		 * This work is queued, but perhaps we locked the wrong gcwq.
-		 * In that case we must see the new value after rmb(), see
-		 * insert_work()->wmb().
-		 */
-		smp_rmb();
-		if (gcwq == get_work_gcwq(work)) {
-			debug_work_deactivate(work);
-			list_del_init(&work->entry);
-			cwq_dec_nr_in_flight(get_work_cwq(work),
-				get_work_color(work),
-				*work_data_bits(work) & WORK_STRUCT_DELAYED);
-			ret = 1;
-		}
-	}
-	spin_unlock_irq(&gcwq->lock);
-
-	return ret;
-}
-
 static bool __cancel_work_timer(struct work_struct *work,
 				struct timer_list* timer)
 {
-- 
cgit v1.2.3


From b5490077274482efde57a50b060b99bc839acd45 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:46 -0700
Subject: workqueue: introduce WORK_OFFQ_FLAG_*

Low WORK_STRUCT_FLAG_BITS bits of work_struct->data contain
WORK_STRUCT_FLAG_* and flush color.  If the work item is queued, the
rest point to the cpu_workqueue with WORK_STRUCT_CWQ set; otherwise,
WORK_STRUCT_CWQ is clear and the bits contain the last CPU number -
either a real CPU number or one of WORK_CPU_*.

Scheduled addition of mod_delayed_work[_on]() requires an additional
flag, which is used only while a work item is off queue.  There are
more than enough bits to represent off-queue CPU number on both 32 and
64bits.  This patch introduces WORK_OFFQ_FLAG_* which occupy the lower
part of the @work->data high bits while off queue.  This patch doesn't
define any actual OFFQ flag yet.

Off-queue CPU number is now shifted by WORK_OFFQ_CPU_SHIFT, which adds
the number of bits used by OFFQ flags to WORK_STRUCT_FLAG_SHIFT, to
make room for OFFQ flags.

To avoid a shift width warning with large WORK_OFFQ_FLAG_BITS, a ulong
cast is added to WORK_STRUCT_NO_CPU and, just in case, a BUILD_BUG_ON()
is added to check that there are enough bits to accommodate the
off-queue CPU number.

This patch doesn't make any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0f50f4078e36..eeae77079483 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -533,9 +533,9 @@ static int work_next_color(int color)
 }
 
 /*
- * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
- * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
- * cleared and the work data contains the cpu number it was last on.
+ * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data
+ * contain the pointer to the queued cwq.  Once execution starts, the flag
+ * is cleared and the high bits contain OFFQ flags and CPU number.
  *
  * set_work_cwq(), set_work_cpu_and_clear_pending() and clear_work_data()
  * can be used to set the cwq, cpu or clear work->data.  These functions
@@ -565,7 +565,7 @@ static void set_work_cwq(struct work_struct *work,
 static void set_work_cpu_and_clear_pending(struct work_struct *work,
 					   unsigned int cpu)
 {
-	set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, 0);
+	set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
 }
 
 static void clear_work_data(struct work_struct *work)
@@ -592,7 +592,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 		return ((struct cpu_workqueue_struct *)
 			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
 
-	cpu = data >> WORK_STRUCT_FLAG_BITS;
+	cpu = data >> WORK_OFFQ_CPU_SHIFT;
 	if (cpu == WORK_CPU_NONE)
 		return NULL;
 
@@ -3724,6 +3724,10 @@ static int __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
+	/* make sure we have enough bits for OFFQ CPU number */
+	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
+		     WORK_CPU_LAST);
+
 	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
 	cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
 
-- 
cgit v1.2.3


From 7beb2edf44b4dea820c733046ad7666d092bb4b6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:46 -0700
Subject: workqueue: factor out __queue_delayed_work() from
 queue_delayed_work_on()

This is to prepare for mod_delayed_work[_on]() and doesn't cause any
functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 74 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 41 insertions(+), 33 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index eeae77079483..d7f1b7e2bbaa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1261,6 +1261,46 @@ void delayed_work_timer_fn(unsigned long __data)
 }
 EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
 
+static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
+				struct delayed_work *dwork, unsigned long delay)
+{
+	struct timer_list *timer = &dwork->timer;
+	struct work_struct *work = &dwork->work;
+	unsigned int lcpu;
+
+	WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
+		     timer->data != (unsigned long)dwork);
+	BUG_ON(timer_pending(timer));
+	BUG_ON(!list_empty(&work->entry));
+
+	timer_stats_timer_set_start_info(&dwork->timer);
+
+	/*
+	 * This stores cwq for the moment, for the timer_fn.  Note that the
+	 * work's gcwq is preserved to allow reentrance detection for
+	 * delayed works.
+	 */
+	if (!(wq->flags & WQ_UNBOUND)) {
+		struct global_cwq *gcwq = get_work_gcwq(work);
+
+		if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
+			lcpu = gcwq->cpu;
+		else
+			lcpu = raw_smp_processor_id();
+	} else {
+		lcpu = WORK_CPU_UNBOUND;
+	}
+
+	set_work_cwq(work, get_cwq(lcpu, wq), 0);
+
+	timer->expires = jiffies + delay;
+
+	if (unlikely(cpu != WORK_CPU_UNBOUND))
+		add_timer_on(timer, cpu);
+	else
+		add_timer(timer);
+}
+
 /**
  * queue_delayed_work_on - queue work on specific CPU after delay
  * @cpu: CPU number to execute work on
@@ -1275,7 +1315,6 @@ EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
 bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			   struct delayed_work *dwork, unsigned long delay)
 {
-	struct timer_list *timer = &dwork->timer;
 	struct work_struct *work = &dwork->work;
 	bool ret = false;
 	unsigned long flags;
@@ -1287,38 +1326,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	local_irq_save(flags);
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
-		unsigned int lcpu;
-
-		WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
-			     timer->data != (unsigned long)dwork);
-		BUG_ON(timer_pending(timer));
-		BUG_ON(!list_empty(&work->entry));
-
-		timer_stats_timer_set_start_info(&dwork->timer);
-
-		/*
-		 * This stores cwq for the moment, for the timer_fn.
-		 * Note that the work's gcwq is preserved to allow
-		 * reentrance detection for delayed works.
-		 */
-		if (!(wq->flags & WQ_UNBOUND)) {
-			struct global_cwq *gcwq = get_work_gcwq(work);
-
-			if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
-				lcpu = gcwq->cpu;
-			else
-				lcpu = raw_smp_processor_id();
-		} else
-			lcpu = WORK_CPU_UNBOUND;
-
-		set_work_cwq(work, get_cwq(lcpu, wq), 0);
-
-		timer->expires = jiffies + delay;
-
-		if (unlikely(cpu != WORK_CPU_UNBOUND))
-			add_timer_on(timer, cpu);
-		else
-			add_timer(timer);
+		__queue_delayed_work(cpu, wq, dwork, delay);
 		ret = true;
 	}
 
-- 
cgit v1.2.3


From 36e227d242f9ec7cb4a8e968561b3b26e3d8b5d1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:46 -0700
Subject: workqueue: reorganize try_to_grab_pending() and __cancel_work_timer()

* Use bool @is_dwork instead of @timer and let try_to_grab_pending()
  use to_delayed_work() to determine the delayed_work address.

* Move timer handling from __cancel_work_timer() to
  try_to_grab_pending().

* Make try_to_grab_pending() use -EAGAIN instead of -1 for
  busy-looping and drop the ret local variable.

* Add proper function comment to try_to_grab_pending().

This makes the code a bit easier to understand and will ease further
changes.  This patch doesn't make any functional change.

v2: Use @is_dwork instead of @timer.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d7f1b7e2bbaa..4b3663b1c677 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1004,15 +1004,33 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
 		complete(&cwq->wq->first_flusher->done);
 }
 
-/*
- * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
- * so this work can't be re-armed in any way.
+/**
+ * try_to_grab_pending - steal work item from worklist
+ * @work: work item to steal
+ * @is_dwork: @work is a delayed_work
+ *
+ * Try to grab PENDING bit of @work.  This function can handle @work in any
+ * stable state - idle, on timer or on worklist.  Return values are
+ *
+ *  1		if @work was pending and we successfully stole PENDING
+ *  0		if @work was idle and we claimed PENDING
+ *  -EAGAIN	if PENDING couldn't be grabbed at the moment, safe to busy-retry
+ *
+ * On >= 0 return, the caller owns @work's PENDING bit.
  */
-static int try_to_grab_pending(struct work_struct *work)
+static int try_to_grab_pending(struct work_struct *work, bool is_dwork)
 {
 	struct global_cwq *gcwq;
-	int ret = -1;
 
+	/* try to steal the timer if it exists */
+	if (is_dwork) {
+		struct delayed_work *dwork = to_delayed_work(work);
+
+		if (likely(del_timer(&dwork->timer)))
+			return 1;
+	}
+
+	/* try to claim PENDING the normal way */
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
 		return 0;
 
@@ -1022,7 +1040,7 @@ static int try_to_grab_pending(struct work_struct *work)
 	 */
 	gcwq = get_work_gcwq(work);
 	if (!gcwq)
-		return ret;
+		return -EAGAIN;
 
 	spin_lock_irq(&gcwq->lock);
 	if (!list_empty(&work->entry)) {
@@ -1038,12 +1056,14 @@ static int try_to_grab_pending(struct work_struct *work)
 			cwq_dec_nr_in_flight(get_work_cwq(work),
 				get_work_color(work),
 				*work_data_bits(work) & WORK_STRUCT_DELAYED);
-			ret = 1;
+
+			spin_unlock_irq(&gcwq->lock);
+			return 1;
 		}
 	}
 	spin_unlock_irq(&gcwq->lock);
 
-	return ret;
+	return -EAGAIN;
 }
 
 /**
@@ -2817,15 +2837,12 @@ bool flush_work_sync(struct work_struct *work)
 }
 EXPORT_SYMBOL_GPL(flush_work_sync);
 
-static bool __cancel_work_timer(struct work_struct *work,
-				struct timer_list* timer)
+static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
 	int ret;
 
 	do {
-		ret = (timer && likely(del_timer(timer)));
-		if (!ret)
-			ret = try_to_grab_pending(work);
+		ret = try_to_grab_pending(work, is_dwork);
 		wait_on_work(work);
 	} while (unlikely(ret < 0));
 
@@ -2853,7 +2870,7 @@ static bool __cancel_work_timer(struct work_struct *work,
  */
 bool cancel_work_sync(struct work_struct *work)
 {
-	return __cancel_work_timer(work, NULL);
+	return __cancel_work_timer(work, false);
 }
 EXPORT_SYMBOL_GPL(cancel_work_sync);
 
@@ -2914,7 +2931,7 @@ EXPORT_SYMBOL(flush_delayed_work_sync);
  */
 bool cancel_delayed_work_sync(struct delayed_work *dwork)
 {
-	return __cancel_work_timer(&dwork->work, &dwork->timer);
+	return __cancel_work_timer(&dwork->work, true);
 }
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
-- 
cgit v1.2.3


From bbb68dfaba73e8338fe0f1dc711cc1d261daec87 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:46 -0700
Subject: workqueue: mark a work item being canceled as such

There can be two reasons try_to_grab_pending() can fail with -EAGAIN.
One is when someone else is queueing or dequeueing the work item.  With
the previous patches, it is guaranteed that PENDING and queued state
will soon agree making it safe to busy-retry in this case.

The other is if multiple __cancel_work_timer() invocations are racing
one another.  __cancel_work_timer() grabs PENDING and then waits for
running instances of the target work item on all CPUs while holding
PENDING and !queued.  try_to_grab_pending() invoked from another task
will keep returning -EAGAIN while the current owner is waiting.

Not distinguishing the two cases is okay because __cancel_work_timer()
is the only user of try_to_grab_pending() and it invokes
wait_on_work() whenever grabbing fails.  For the first case, busy
looping should be fine but wait_on_work() doesn't cause any critical
problem.  For the latter case, the new contender usually waits for the
same condition as the current owner, so no unnecessarily extended
busy-looping happens.  Combined, these make __cancel_work_timer()
technically correct even without irq protection while grabbing PENDING
or distinguishing the two different cases.

While the current code is technically correct, not distinguishing the
two cases makes it difficult to use try_to_grab_pending() for other
purposes than canceling because it's impossible to tell whether it's
safe to busy-retry grabbing.

This patch adds a mechanism to mark a work item being canceled.
try_to_grab_pending() now disables irq on success and returns -EAGAIN
to indicate that grabbing failed but PENDING and queued states are
going to agree soon and it's safe to busy-loop.  It returns -ENOENT if
the work item is being canceled and it may stay PENDING && !queued for
arbitrary amount of time.

__cancel_work_timer() is modified to mark the work canceling with
WORK_OFFQ_CANCELING after grabbing PENDING, thus making
try_to_grab_pending() fail with -ENOENT instead of -EAGAIN.  Also, it
invokes wait_on_work() iff grabbing failed with -ENOENT.  This isn't
necessary for correctness but makes it consistent with other future
users of try_to_grab_pending().

v2: try_to_grab_pending() was testing preempt_count() to ensure that
    the caller has disabled preemption.  This triggers spuriously if
    !CONFIG_PREEMPT_COUNT.  Use preemptible() instead.  Reported by
    Fengguang Wu.

v3: Updated so that try_to_grab_pending() disables irq on success
    rather than requiring preemption disabled by the caller.  This
    makes busy-looping easier and will allow try_to_grab_pending() to
    be used from bh/irq contexts.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Fengguang Wu <fengguang.wu@intel.com>
---
 kernel/workqueue.c | 90 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 72 insertions(+), 18 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4b3663b1c677..b4a4e05c89e1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -537,15 +537,20 @@ static int work_next_color(int color)
  * contain the pointer to the queued cwq.  Once execution starts, the flag
  * is cleared and the high bits contain OFFQ flags and CPU number.
  *
- * set_work_cwq(), set_work_cpu_and_clear_pending() and clear_work_data()
- * can be used to set the cwq, cpu or clear work->data.  These functions
- * should only be called while the work is owned - ie. while the PENDING
- * bit is set.
+ * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
+ * and clear_work_data() can be used to set the cwq, cpu or clear
+ * work->data.  These functions should only be called while the work is
+ * owned - ie. while the PENDING bit is set.
  *
- * get_work_[g]cwq() can be used to obtain the gcwq or cwq
- * corresponding to a work.  gcwq is available once the work has been
- * queued anywhere after initialization.  cwq is available only from
- * queueing until execution starts.
+ * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
+ * a work.  gcwq is available once the work has been queued anywhere after
+ * initialization until it is sync canceled.  cwq is available only while
+ * the work item is queued.
+ *
+ * %WORK_OFFQ_CANCELING is used to mark a work item which is being
+ * canceled.  While being canceled, a work item may have its PENDING set
+ * but stay off timer and worklist for arbitrarily long and nobody should
+ * try to steal the PENDING bit.
  */
 static inline void set_work_data(struct work_struct *work, unsigned long data,
 				 unsigned long flags)
@@ -600,6 +605,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 	return get_gcwq(cpu);
 }
 
+static void mark_work_canceling(struct work_struct *work)
+{
+	struct global_cwq *gcwq = get_work_gcwq(work);
+	unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
+
+	set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
+		      WORK_STRUCT_PENDING);
+}
+
+static bool work_is_canceling(struct work_struct *work)
+{
+	unsigned long data = atomic_long_read(&work->data);
+
+	return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
+}
+
 /*
  * Policy functions.  These define the policies on how the global worker
  * pools are managed.  Unless noted otherwise, these functions assume that
@@ -1005,9 +1026,10 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
 }
 
 /**
- * try_to_grab_pending - steal work item from worklist
+ * try_to_grab_pending - steal work item from worklist and disable irq
  * @work: work item to steal
  * @is_dwork: @work is a delayed_work
+ * @flags: place to store irq state
  *
  * Try to grab PENDING bit of @work.  This function can handle @work in any
  * stable state - idle, on timer or on worklist.  Return values are
@@ -1015,13 +1037,30 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
  *  1		if @work was pending and we successfully stole PENDING
  *  0		if @work was idle and we claimed PENDING
  *  -EAGAIN	if PENDING couldn't be grabbed at the moment, safe to busy-retry
+ *  -ENOENT	if someone else is canceling @work, this state may persist
+ *		for arbitrarily long
  *
- * On >= 0 return, the caller owns @work's PENDING bit.
+ * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
+ * preempted while holding PENDING and @work off queue, preemption must be
+ * disabled on entry.  This ensures that we don't return -EAGAIN while
+ * another task is preempted in this function.
+ *
+ * On successful return, >= 0, irq is disabled and the caller is
+ * responsible for releasing it using local_irq_restore(*@flags).
+ *
+ * This function is safe to call from any context other than IRQ handler.
+ * An IRQ handler may run on top of delayed_work_timer_fn() which can make
+ * this function return -EAGAIN perpetually.
  */
-static int try_to_grab_pending(struct work_struct *work, bool is_dwork)
+static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
+			       unsigned long *flags)
 {
 	struct global_cwq *gcwq;
 
+	WARN_ON_ONCE(in_irq());
+
+	local_irq_save(*flags);
+
 	/* try to steal the timer if it exists */
 	if (is_dwork) {
 		struct delayed_work *dwork = to_delayed_work(work);
@@ -1040,9 +1079,9 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork)
 	 */
 	gcwq = get_work_gcwq(work);
 	if (!gcwq)
-		return -EAGAIN;
+		goto fail;
 
-	spin_lock_irq(&gcwq->lock);
+	spin_lock(&gcwq->lock);
 	if (!list_empty(&work->entry)) {
 		/*
 		 * This work is queued, but perhaps we locked the wrong gcwq.
@@ -1057,12 +1096,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork)
 				get_work_color(work),
 				*work_data_bits(work) & WORK_STRUCT_DELAYED);
 
-			spin_unlock_irq(&gcwq->lock);
+			spin_unlock(&gcwq->lock);
 			return 1;
 		}
 	}
-	spin_unlock_irq(&gcwq->lock);
-
+	spin_unlock(&gcwq->lock);
+fail:
+	local_irq_restore(*flags);
+	if (work_is_canceling(work))
+		return -ENOENT;
+	cpu_relax();
 	return -EAGAIN;
 }
 
@@ -2839,13 +2882,24 @@ EXPORT_SYMBOL_GPL(flush_work_sync);
 
 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
+	unsigned long flags;
 	int ret;
 
 	do {
-		ret = try_to_grab_pending(work, is_dwork);
-		wait_on_work(work);
+		ret = try_to_grab_pending(work, is_dwork, &flags);
+		/*
+		 * If someone else is canceling, wait for the same event it
+		 * would be waiting for before retrying.
+		 */
+		if (unlikely(ret == -ENOENT))
+			wait_on_work(work);
 	} while (unlikely(ret < 0));
 
+	/* tell other tasks trying to grab @work to back off */
+	mark_work_canceling(work);
+	local_irq_restore(flags);
+
+	wait_on_work(work);
 	clear_work_data(work);
 	return ret;
 }
-- 
cgit v1.2.3


From 8376fe22c7e79c7e90857d39f82aeae6cad6c4b8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Aug 2012 10:30:47 -0700
Subject: workqueue: implement mod_delayed_work[_on]()

Workqueue was lacking a mechanism to modify the timeout of an already
pending delayed_work.  delayed_work users have been working around
this using several methods - using an explicit timer + work item,
messing directly with delayed_work->timer, and canceling before
re-queueing, all of which are error-prone and/or ugly.

This patch implements mod_delayed_work[_on]() which behaves similarly
to mod_timer() - if the delayed_work is idle, it's queued with the
given delay; otherwise, its timeout is modified to the new value.
Zero @delay guarantees immediate execution.

v2: Updated to reflect try_to_grab_pending() changes.  Now safe to be
    called from bh context.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
---
 kernel/workqueue.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b4a4e05c89e1..41ae2c0979fe 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1413,6 +1413,59 @@ bool queue_delayed_work(struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work);
 
+/**
+ * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
+ * modify @dwork's timer so that it expires after @delay.  If @delay is
+ * zero, @work is guaranteed to be scheduled immediately regardless of its
+ * current state.
+ *
+ * Returns %false if @dwork was idle and queued, %true if @dwork was
+ * pending and its timer was modified.
+ *
+ * This function is safe to call from any context other than IRQ handler.
+ * See try_to_grab_pending() for details.
+ */
+bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
+			 struct delayed_work *dwork, unsigned long delay)
+{
+	unsigned long flags;
+	int ret;
+
+	do {
+		ret = try_to_grab_pending(&dwork->work, true, &flags);
+	} while (unlikely(ret == -EAGAIN));
+
+	if (likely(ret >= 0)) {
+		__queue_delayed_work(cpu, wq, dwork, delay);
+		local_irq_restore(flags);
+	}
+
+	/* -ENOENT from try_to_grab_pending() becomes %true */
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mod_delayed_work_on);
+
+/**
+ * mod_delayed_work - modify delay of or queue a delayed work
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * mod_delayed_work_on() on local CPU.
+ */
+bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
+		      unsigned long delay)
+{
+	return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
+}
+EXPORT_SYMBOL_GPL(mod_delayed_work);
+
 /**
  * worker_enter_idle - enter idle state
  * @worker: worker which is entering idle state
-- 
cgit v1.2.3


From 1265057fa02c7bed3b6d9ddc8a2048065a370364 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 8 Aug 2012 09:38:42 -0700
Subject: workqueue: fix CPU binding of flush_delayed_work[_sync]()

delayed_work encodes the workqueue to use and the last CPU in
delayed_work->work.data while it's on timer.  The target CPU is
implicitly recorded as the CPU the timer is queued on and
delayed_work_timer_fn() queues delayed_work->work to the CPU it is
running on.

Unfortunately, this leaves flush_delayed_work[_sync]() no way to find
out which CPU the delayed_work was queued for when they try to
re-queue after killing the timer.  Currently, it chooses the local CPU
flush is running on.  This can unexpectedly move a delayed_work queued
on a specific CPU to another CPU and lead to subtle errors.

There isn't much point in trying to save several bytes in struct
delayed_work, which is already close to a hundred bytes on 64bit with
all debug options turned off.  This patch adds delayed_work->cpu to
remember the CPU it's queued for.

Note that if the timer is migrated during CPU down, the work item
could be queued to the downed global_cwq after this change.  As a
detached global_cwq behaves like an unbound one, this doesn't change
much for the delayed_work.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/workqueue.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 41ae2c0979fe..11723c5b2b20 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1319,7 +1319,7 @@ void delayed_work_timer_fn(unsigned long __data)
 	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
 
 	local_irq_disable();
-	__queue_work(WORK_CPU_UNBOUND, cwq->wq, &dwork->work);
+	__queue_work(dwork->cpu, cwq->wq, &dwork->work);
 	local_irq_enable();
 }
 EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
@@ -1356,6 +1356,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 
 	set_work_cwq(work, get_cwq(lcpu, wq), 0);
 
+	dwork->cpu = cpu;
 	timer->expires = jiffies + delay;
 
 	if (unlikely(cpu != WORK_CPU_UNBOUND))
@@ -2997,7 +2998,7 @@ bool flush_delayed_work(struct delayed_work *dwork)
 {
 	local_irq_disable();
 	if (del_timer_sync(&dwork->timer))
-		__queue_work(WORK_CPU_UNBOUND,
+		__queue_work(dwork->cpu,
 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
 	local_irq_enable();
 	return flush_work(&dwork->work);
@@ -3020,7 +3021,7 @@ bool flush_delayed_work_sync(struct delayed_work *dwork)
 {
 	local_irq_disable();
 	if (del_timer_sync(&dwork->timer))
-		__queue_work(WORK_CPU_UNBOUND,
+		__queue_work(dwork->cpu,
 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
 	local_irq_enable();
 	return flush_work_sync(&dwork->work);
-- 
cgit v1.2.3


From 23657bb192f14b789e4c478def8f11ecc95b4f6c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 13 Aug 2012 17:08:19 -0700
Subject: workqueue: add missing wmb() in clear_work_data()

Any operation which clears PENDING should be preceded by a wmb to
guarantee that the next PENDING owner sees all the changes made before
PENDING release.

There are only two places where PENDING is cleared -
set_work_cpu_and_clear_pending() and clear_work_data().  The caller of
the former already does smp_wmb() but the latter doesn't have any.

Move the wmb above set_work_cpu_and_clear_pending() into it and add
one to clear_work_data().

There hasn't been any report related to this issue, and, given how
clear_work_data() is used, it is extremely unlikely to have caused any
actual problems on any architecture.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 kernel/workqueue.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 11723c5b2b20..4fef9527a620 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -570,11 +570,19 @@ static void set_work_cwq(struct work_struct *work,
 static void set_work_cpu_and_clear_pending(struct work_struct *work,
 					   unsigned int cpu)
 {
+	/*
+	 * The following wmb is paired with the implied mb in
+	 * test_and_set_bit(PENDING) and ensures all updates to @work made
+	 * here are visible to and precede any updates by the next PENDING
+	 * owner.
+	 */
+	smp_wmb();
 	set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
 }
 
 static void clear_work_data(struct work_struct *work)
 {
+	smp_wmb();	/* see set_work_cpu_and_clear_pending() */
 	set_work_data(work, WORK_STRUCT_NO_CPU, 0);
 }
 
@@ -2182,14 +2190,11 @@ __acquires(&gcwq->lock)
 		wake_up_worker(pool);
 
 	/*
-	 * Record the last CPU and clear PENDING.  The following wmb is
-	 * paired with the implied mb in test_and_set_bit(PENDING) and
-	 * ensures all updates to @work made here are visible to and
-	 * precede any updates by the next PENDING owner.  Also, clear
-	 * PENDING inside @gcwq->lock so that PENDING and queued state
-	 * changes happen together while IRQ is disabled.
+	 * Record the last CPU and clear PENDING which should be the last
+	 * update to @work.  Also, do this inside @gcwq->lock so that
+	 * PENDING and queued state changes happen together while IRQ is
+	 * disabled.
 	 */
-	smp_wmb();
 	set_work_cpu_and_clear_pending(work, gcwq->cpu);
 
 	spin_unlock_irq(&gcwq->lock);
-- 
cgit v1.2.3


From 330dad5b9c9555632578c00e94e85c122561c5c7 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Wed, 15 Aug 2012 23:25:36 +0900
Subject: workqueue: use enum value to set array size of pools in gcwq

Commit 3270476a6c0ce322354df8679652f060d66526dc ('workqueue: reimplement
WQ_HIGHPRI using a separate worker_pool') introduced a separate
worker_pool for HIGHPRI.  Although there is an NR_WORKER_POOLS enum value
which represents the array size of pools, the definition of pools in
gcwq doesn't use it.  Using it makes the code more robust and prevents
future mistakes, so change the code to use this enum value.

Signed-off-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4fef9527a620..49d8f4a0110d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -183,7 +183,8 @@ struct global_cwq {
 	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
 						/* L: hash of busy workers */
 
-	struct worker_pool	pools[2];	/* normal and highpri pools */
+	struct worker_pool	pools[NR_WORKER_POOLS];
+						/* normal and highpri pools */
 
 	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
 } ____cacheline_aligned_in_smp;
-- 
cgit v1.2.3


From b75cac9368fa91636e17d0f7950b35d837154e14 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Wed, 15 Aug 2012 23:25:37 +0900
Subject: workqueue: correct req_cpu in trace_workqueue_queue_work()

When tracing workqueue_queue_work(), the requested cpu is recorded.
However, if !(@wq->flags & WQ_UNBOUND) and @cpu is WORK_CPU_UNBOUND,
the requested cpu is replaced with the local cpu before it is traced.
In the @wq->flags & WQ_UNBOUND case, no such replacement occurs,
so it is reasonable to correct the bound case to match.

Use temporary local variable for storing requested cpu.

Signed-off-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 49d8f4a0110d..c29f2dc0f4fc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1198,6 +1198,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	struct cpu_workqueue_struct *cwq;
 	struct list_head *worklist;
 	unsigned int work_flags;
+	unsigned int req_cpu = cpu;
 
 	/*
 	 * While a work item is PENDING && off queue, a task trying to
@@ -1253,7 +1254,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 
 	/* gcwq determined, get cwq and queue */
 	cwq = get_cwq(gcwq->cpu, wq);
-	trace_workqueue_queue_work(cpu, cwq, work);
+	trace_workqueue_queue_work(req_cpu, cwq, work);
 
 	if (WARN_ON(!list_empty(&work->entry))) {
 		spin_unlock(&gcwq->lock);
-- 
cgit v1.2.3


From e42986de481238204f6e0b0f4434da428895c20b Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Wed, 15 Aug 2012 23:25:38 +0900
Subject: workqueue: change value of lcpu in __queue_delayed_work_on()

We assign a cpu id to the work struct's data field in
__queue_delayed_work_on().  In the current implementation, when the work
item is queued for the first time, the id of the currently running cpu
is assigned.  If we call __queue_delayed_work_on() for CPU A while
running on CPU B, the __queue_work() invoked from
delayed_work_timer_fn() goes into the following sub-optimal path in the
WQ_NON_REENTRANT case.

	gcwq = get_gcwq(cpu);
	if (wq->flags & WQ_NON_REENTRANT &&
		(last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {

Set lcpu to @cpu and fall back to the local cpu if lcpu is
WORK_CPU_UNBOUND.  This is sufficient to avoid the sub-optimal path.

tj: Slightly rephrased the comment.

Signed-off-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c29f2dc0f4fc..99ee9b939264 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1356,9 +1356,15 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	if (!(wq->flags & WQ_UNBOUND)) {
 		struct global_cwq *gcwq = get_work_gcwq(work);
 
-		if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
+		/*
+		 * If we cannot get the last gcwq from @work directly,
+		 * select the last CPU such that it avoids unnecessarily
+		 * triggering non-reentrancy check in __queue_work().
+		 */
+		lcpu = cpu;
+		if (gcwq)
 			lcpu = gcwq->cpu;
-		else
+		if (lcpu == WORK_CPU_UNBOUND)
 			lcpu = raw_smp_processor_id();
 	} else {
 		lcpu = WORK_CPU_UNBOUND;
-- 
cgit v1.2.3


From 1aabe902ca3638d862bf0dad5a697d3a8e046b0a Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Wed, 15 Aug 2012 23:25:39 +0900
Subject: workqueue: introduce system_highpri_wq

Commit 3270476a6c0ce322354df8679652f060d66526dc ('workqueue: reimplement
WQ_HIGHPRI using a separate worker_pool') introduced a separate worker
pool for HIGHPRI.  When we handle busy workers of a gcwq, each one can be
either a normal worker or a highpri worker.  However, rebind_workers()
doesn't consider this difference and uses system_wq even for highpri
workers.  This creates a mismatch between cwq->pool and worker->pool.

It doesn't cause an error in the current implementation, but it may in
the future.  Introduce system_highpri_wq so that the proper cwq can be
used for highpri workers in rebind_workers().  The following patch
fixes this issue properly.

tj: Even apart from rebinding, having system_highpri_wq generally
    makes sense.

Signed-off-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 99ee9b939264..329c404b68c2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -269,12 +269,14 @@ struct workqueue_struct {
 };
 
 struct workqueue_struct *system_wq __read_mostly;
+struct workqueue_struct *system_highpri_wq __read_mostly;
 struct workqueue_struct *system_long_wq __read_mostly;
 struct workqueue_struct *system_nrt_wq __read_mostly;
 struct workqueue_struct *system_unbound_wq __read_mostly;
 struct workqueue_struct *system_freezable_wq __read_mostly;
 struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_wq);
+EXPORT_SYMBOL_GPL(system_highpri_wq);
 EXPORT_SYMBOL_GPL(system_long_wq);
 EXPORT_SYMBOL_GPL(system_nrt_wq);
 EXPORT_SYMBOL_GPL(system_unbound_wq);
@@ -3928,6 +3930,7 @@ static int __init init_workqueues(void)
 	}
 
 	system_wq = alloc_workqueue("events", 0, 0);
+	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
 	system_long_wq = alloc_workqueue("events_long", 0, 0);
 	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
 	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
@@ -3936,9 +3939,9 @@ static int __init init_workqueues(void)
 					      WQ_FREEZABLE, 0);
 	system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
 			WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
-	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
-	       !system_unbound_wq || !system_freezable_wq ||
-		!system_nrt_freezable_wq);
+	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+	       !system_nrt_wq || !system_unbound_wq || !system_freezable_wq ||
+	       !system_nrt_freezable_wq);
 	return 0;
 }
 early_initcall(init_workqueues);
-- 
cgit v1.2.3


From e2b6a6d570f070aa90ac00d2d10b1488512f8520 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Wed, 15 Aug 2012 23:25:40 +0900
Subject: workqueue: use system_highpri_wq for highpri workers in
 rebind_workers()

In rebind_workers(), we insert a work item to rebind each busy worker
to its cpu.  Currently, only system_wq is used for this.  This is a
potential source of errors as there is a mismatch between cwq->pool and
worker->pool.

To prevent this, we should use system_highpri_wq for highpri workers
so that the two match.  This patch implements it.

tj: Rephrased comment a bit.

Signed-off-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 329c404b68c2..8936761b814a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1741,6 +1741,7 @@ retry:
 	/* rebind busy workers */
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		struct work_struct *rebind_work = &worker->rebind_work;
+		struct workqueue_struct *wq;
 
 		/* morph UNBOUND to REBIND */
 		worker->flags &= ~WORKER_UNBOUND;
@@ -1750,11 +1751,20 @@ retry:
 				     work_data_bits(rebind_work)))
 			continue;
 
-		/* wq doesn't matter, use the default one */
 		debug_work_activate(rebind_work);
-		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
-			    worker->scheduled.next,
-			    work_color_to_flags(WORK_NO_COLOR));
+
+		/*
+		 * wq doesn't really matter but let's keep @worker->pool
+		 * and @cwq->pool consistent for sanity.
+		 */
+		if (worker_pool_pri(worker->pool))
+			wq = system_highpri_wq;
+		else
+			wq = system_wq;
+
+		insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
+			worker->scheduled.next,
+			work_color_to_flags(WORK_NO_COLOR));
 	}
 }
 
-- 
cgit v1.2.3


From 7635d2fd7f0fa63b6ec03050614c314d7139f14a Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Wed, 15 Aug 2012 23:25:41 +0900
Subject: workqueue: use system_highpri_wq for unbind_work

To speed up cpu-down processing, use system_highpri_wq.  As the
scheduling priority of its workers is higher than that of system_wq's
and it is not contended by other normal work items on this cpu, work
queued on it is processed sooner than on system_wq.

tj: CPU up/downs care quite a bit about latency these days.  This
    shouldn't hurt anything and makes sense.

Signed-off-by: Joonsoo Kim <js1304@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8936761b814a..7da24711038f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3680,7 +3680,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 		/* unbinding should happen on the local CPU */
 		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
-		schedule_work_on(cpu, &unbind_work);
+		queue_work_on(cpu, system_highpri_wq, &unbind_work);
 		flush_work(&unbind_work);
 		break;
 	}
-- 
cgit v1.2.3


From 044c782ce3a901fbd17cbe701c592f582381174d Mon Sep 17 00:00:00 2001
From: Valentin Ilie <valentin.ilie@gmail.com>
Date: Sun, 19 Aug 2012 00:52:42 +0300
Subject: workqueue: fix checkpatch issues

Fixed some checkpatch warnings.

tj: adapted to wq/for-3.7 and massaged pr_xxx() format strings a bit.

Signed-off-by: Valentin Ilie <valentin.ilie@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <1345326762-21747-1-git-send-email-valentin.ilie@gmail.com>
---
 kernel/workqueue.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7da24711038f..de429ba000ee 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -269,18 +269,18 @@ struct workqueue_struct {
 };
 
 struct workqueue_struct *system_wq __read_mostly;
-struct workqueue_struct *system_highpri_wq __read_mostly;
-struct workqueue_struct *system_long_wq __read_mostly;
-struct workqueue_struct *system_nrt_wq __read_mostly;
-struct workqueue_struct *system_unbound_wq __read_mostly;
-struct workqueue_struct *system_freezable_wq __read_mostly;
-struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_wq);
+struct workqueue_struct *system_highpri_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_highpri_wq);
+struct workqueue_struct *system_long_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_long_wq);
+struct workqueue_struct *system_nrt_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_nrt_wq);
+struct workqueue_struct *system_unbound_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
+struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 
 #define CREATE_TRACE_POINTS
@@ -2232,11 +2232,9 @@ __acquires(&gcwq->lock)
 	lock_map_release(&cwq->wq->lockdep_map);
 
 	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
-		printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
-		       "%s/0x%08x/%d\n",
-		       current->comm, preempt_count(), task_pid_nr(current));
-		printk(KERN_ERR "    last function: ");
-		print_symbol("%s\n", (unsigned long)f);
+		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
+		       "     last function: %pf\n",
+		       current->comm, preempt_count(), task_pid_nr(current), f);
 		debug_show_held_locks(current);
 		dump_stack();
 	}
@@ -2790,8 +2788,8 @@ reflush:
 
 		if (++flush_cnt == 10 ||
 		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
-			pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
-				   wq->name, flush_cnt);
+			pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
+				wq->name, flush_cnt);
 		goto reflush;
 	}
 
@@ -3275,9 +3273,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
 	int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
 
 	if (max_active < 1 || max_active > lim)
-		printk(KERN_WARNING "workqueue: max_active %d requested for %s "
-		       "is out of range, clamping between %d and %d\n",
-		       max_active, name, 1, lim);
+		pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
+			max_active, name, 1, lim);
 
 	return clamp_val(max_active, 1, lim);
 }
-- 
cgit v1.2.3


From dbf2576e37da0fcc7aacbfbb9fd5d3de7888a3c1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 20 Aug 2012 14:51:23 -0700
Subject: workqueue: make all workqueues non-reentrant

By default, each per-cpu part of a bound workqueue operates separately
and a work item may be executing concurrently on different CPUs.  The
behavior avoids some cross-cpu traffic but leads to subtle weirdities
and not-so-subtle contortions in the API.

* There's no sane usefulness in allowing a single work item to be
  executed concurrently on multiple CPUs.  People just get the
  behavior unintentionally and get surprised after learning about it.
  Most either explicitly synchronize or use non-reentrant/ordered
  workqueue but this is error-prone.

* flush_work() can't wait for multiple instances of the same work item
  on different CPUs.  If a work item is executing on cpu0 and then
  queued on cpu1, flush_work() can only wait for the one on cpu1.

  Unfortunately, work items can easily cross CPU boundaries
  unintentionally when the queueing thread gets migrated.  This means
  that if multiple queuers compete, flush_work() can't even guarantee
  that the instance queued right before it is finished before
  returning.

* flush_work_sync() was added to work around some of the deficiencies
  of flush_work().  In addition to the usual flushing, it ensures that
  all currently executing instances are finished before returning.
  This operation is expensive as it has to walk all CPUs and at the
  same time fails to address competing queuer case.

  Incorrectly using flush_work() when flush_work_sync() is necessary
  is an easy error to make and can lead to bugs which are difficult to
  reproduce.

* Similar problems exist for flush_delayed_work[_sync]().

Other than the cross-cpu access concern, there's no benefit in
allowing parallel execution and it's plain silly to have this level of
contortion for workqueue which is widely used from core code to
extremely obscure drivers.

This patch makes all workqueues non-reentrant.  If a work item is
executing on a different CPU when queueing is requested, it is always
queued to that CPU.  This guarantees that any given work item can be
executing on one CPU at maximum and if a work item is queued and
executing, both are on the same CPU.

The only behavior change which may affect workqueue users negatively
is that non-reentrancy overrides the affinity specified by
queue_work_on().  On a reentrant workqueue, the affinity specified by
queue_work_on() is always followed.  Now, if the work item is
executing on one of the CPUs, the work item will be queued there
regardless of the requested affinity.  I've reviewed all workqueue
users which request explicit affinity, and, fortunately, none seems to
be crazy enough to exploit parallel execution of the same work item.

This adds an additional busy_hash lookup if the work item was
previously queued on a different CPU.  This shouldn't be noticeable
under any sane workload.  Work item queueing isn't a very
high-frequency operation and they don't jump across CPUs all the time.
In a micro benchmark to exaggerate this difference - measuring the
time it takes for two work items to repeatedly jump between two CPUs a
number (10M) of times with busy_hash table densely populated, the
difference was around 3%.

While the overhead is measurable, it is only visible in pathological
cases and the difference isn't huge.  This change brings much needed
sanity to workqueue and makes its behavior consistent with timer.  I
think this is the right tradeoff to make.

This enables significant simplification of workqueue API.
Simplification patches will follow.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index de429ba000ee..c4feef9798ea 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1225,14 +1225,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 			cpu = raw_smp_processor_id();
 
 		/*
-		 * It's multi cpu.  If @wq is non-reentrant and @work
-		 * was previously on a different cpu, it might still
-		 * be running there, in which case the work needs to
-		 * be queued on that cpu to guarantee non-reentrance.
+		 * It's multi cpu.  If @work was previously on a different
+		 * cpu, it might still be running there, in which case the
+		 * work needs to be queued on that cpu to guarantee
+		 * non-reentrancy.
 		 */
 		gcwq = get_gcwq(cpu);
-		if (wq->flags & WQ_NON_REENTRANT &&
-		    (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
+		last_gcwq = get_work_gcwq(work);
+
+		if (last_gcwq && last_gcwq != gcwq) {
 			struct worker *worker;
 
 			spin_lock(&last_gcwq->lock);
-- 
cgit v1.2.3


From 606a5020b9bdceb20b4f43e11db0054afa349028 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 20 Aug 2012 14:51:23 -0700
Subject: workqueue: gut flush[_delayed]_work_sync()

Now that all workqueues are non-reentrant, flush[_delayed]_work_sync()
are equivalent to flush[_delayed]_work().  Drop the separate
implementation and make them thin wrappers around
flush[_delayed]_work().

* start_flush_work() no longer takes @wait_executing as the only left
  user - flush_work() - always sets it to %true.

* __cancel_work_timer() uses flush_work() instead of wait_on_work().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 122 +++++------------------------------------------------
 1 file changed, 10 insertions(+), 112 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c4feef9798ea..5f13a9a2c792 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2801,8 +2801,7 @@ reflush:
 }
 EXPORT_SYMBOL_GPL(drain_workqueue);
 
-static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
-			     bool wait_executing)
+static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 {
 	struct worker *worker = NULL;
 	struct global_cwq *gcwq;
@@ -2824,13 +2823,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 		cwq = get_work_cwq(work);
 		if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
 			goto already_gone;
-	} else if (wait_executing) {
+	} else {
 		worker = find_worker_executing_work(gcwq, work);
 		if (!worker)
 			goto already_gone;
 		cwq = worker->current_cwq;
-	} else
-		goto already_gone;
+	}
 
 	insert_wq_barrier(cwq, barr, work, worker);
 	spin_unlock_irq(&gcwq->lock);
@@ -2857,15 +2855,8 @@ already_gone:
  * flush_work - wait for a work to finish executing the last queueing instance
  * @work: the work to flush
  *
- * Wait until @work has finished execution.  This function considers
- * only the last queueing instance of @work.  If @work has been
- * enqueued across different CPUs on a non-reentrant workqueue or on
- * multiple workqueues, @work might still be executing on return on
- * some of the CPUs from earlier queueing.
- *
- * If @work was queued only on a non-reentrant, ordered or unbound
- * workqueue, @work is guaranteed to be idle on return if it hasn't
- * been requeued since flush started.
+ * Wait until @work has finished execution.  @work is guaranteed to be idle
+ * on return if it hasn't been requeued since flush started.
  *
  * RETURNS:
  * %true if flush_work() waited for the work to finish execution,
@@ -2878,85 +2869,15 @@ bool flush_work(struct work_struct *work)
 	lock_map_acquire(&work->lockdep_map);
 	lock_map_release(&work->lockdep_map);
 
-	if (start_flush_work(work, &barr, true)) {
+	if (start_flush_work(work, &barr)) {
 		wait_for_completion(&barr.done);
 		destroy_work_on_stack(&barr.work);
 		return true;
-	} else
-		return false;
-}
-EXPORT_SYMBOL_GPL(flush_work);
-
-static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
-{
-	struct wq_barrier barr;
-	struct worker *worker;
-
-	spin_lock_irq(&gcwq->lock);
-
-	worker = find_worker_executing_work(gcwq, work);
-	if (unlikely(worker))
-		insert_wq_barrier(worker->current_cwq, &barr, work, worker);
-
-	spin_unlock_irq(&gcwq->lock);
-
-	if (unlikely(worker)) {
-		wait_for_completion(&barr.done);
-		destroy_work_on_stack(&barr.work);
-		return true;
-	} else
+	} else {
 		return false;
-}
-
-static bool wait_on_work(struct work_struct *work)
-{
-	bool ret = false;
-	int cpu;
-
-	might_sleep();
-
-	lock_map_acquire(&work->lockdep_map);
-	lock_map_release(&work->lockdep_map);
-
-	for_each_gcwq_cpu(cpu)
-		ret |= wait_on_cpu_work(get_gcwq(cpu), work);
-	return ret;
-}
-
-/**
- * flush_work_sync - wait until a work has finished execution
- * @work: the work to flush
- *
- * Wait until @work has finished execution.  On return, it's
- * guaranteed that all queueing instances of @work which happened
- * before this function is called are finished.  In other words, if
- * @work hasn't been requeued since this function was called, @work is
- * guaranteed to be idle on return.
- *
- * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
- * %false if it was already idle.
- */
-bool flush_work_sync(struct work_struct *work)
-{
-	struct wq_barrier barr;
-	bool pending, waited;
-
-	/* we'll wait for executions separately, queue barr only if pending */
-	pending = start_flush_work(work, &barr, false);
-
-	/* wait for executions to finish */
-	waited = wait_on_work(work);
-
-	/* wait for the pending one */
-	if (pending) {
-		wait_for_completion(&barr.done);
-		destroy_work_on_stack(&barr.work);
 	}
-
-	return pending || waited;
 }
-EXPORT_SYMBOL_GPL(flush_work_sync);
+EXPORT_SYMBOL_GPL(flush_work);
 
 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
@@ -2970,14 +2891,14 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 		 * would be waiting for before retrying.
 		 */
 		if (unlikely(ret == -ENOENT))
-			wait_on_work(work);
+			flush_work(work);
 	} while (unlikely(ret < 0));
 
 	/* tell other tasks trying to grab @work to back off */
 	mark_work_canceling(work);
 	local_irq_restore(flags);
 
-	wait_on_work(work);
+	flush_work(work);
 	clear_work_data(work);
 	return ret;
 }
@@ -3029,29 +2950,6 @@ bool flush_delayed_work(struct delayed_work *dwork)
 }
 EXPORT_SYMBOL(flush_delayed_work);
 
-/**
- * flush_delayed_work_sync - wait for a dwork to finish
- * @dwork: the delayed work to flush
- *
- * Delayed timer is cancelled and the pending work is queued for
- * execution immediately.  Other than timer handling, its behavior
- * is identical to flush_work_sync().
- *
- * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
- * %false if it was already idle.
- */
-bool flush_delayed_work_sync(struct delayed_work *dwork)
-{
-	local_irq_disable();
-	if (del_timer_sync(&dwork->timer))
-		__queue_work(dwork->cpu,
-			     get_work_cwq(&dwork->work)->wq, &dwork->work);
-	local_irq_enable();
-	return flush_work_sync(&dwork->work);
-}
-EXPORT_SYMBOL(flush_delayed_work_sync);
-
 /**
  * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
  * @dwork: the delayed work cancel
-- 
cgit v1.2.3


From ae930e0f4e66fd540c6fbad9f1e2a7743d8b9afe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 20 Aug 2012 14:51:23 -0700
Subject: workqueue: gut system_nrt[_freezable]_wq()

Now that all workqueues are non-reentrant, system[_freezable]_wq() are
equivalent to system_nrt[_freezable]_wq().  Replace the latter with
wrappers around system[_freezable]_wq().  The wrapping goes through
inline functions so that __deprecated can be added easily.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f13a9a2c792..85bd3409b9f5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -274,14 +274,10 @@ struct workqueue_struct *system_highpri_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_highpri_wq);
 struct workqueue_struct *system_long_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_long_wq);
-struct workqueue_struct *system_nrt_wq __read_mostly;
-EXPORT_SYMBOL_GPL(system_nrt_wq);
 struct workqueue_struct *system_unbound_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
 struct workqueue_struct *system_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
-struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
-EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
@@ -3838,16 +3834,12 @@ static int __init init_workqueues(void)
 	system_wq = alloc_workqueue("events", 0, 0);
 	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
 	system_long_wq = alloc_workqueue("events_long", 0, 0);
-	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
 	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
 					    WQ_UNBOUND_MAX_ACTIVE);
 	system_freezable_wq = alloc_workqueue("events_freezable",
 					      WQ_FREEZABLE, 0);
-	system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
-			WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
 	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
-	       !system_nrt_wq || !system_unbound_wq || !system_freezable_wq ||
-	       !system_nrt_freezable_wq);
+	       !system_unbound_wq || !system_freezable_wq);
 	return 0;
 }
 early_initcall(init_workqueues);
-- 
cgit v1.2.3


From e0aecdd874d78b7129a64b056c20e529e2c916df Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Aug 2012 13:18:24 -0700
Subject: workqueue: use irqsafe timer for delayed_work

Up to now, for delayed_works, try_to_grab_pending() couldn't be used
from IRQ handlers because IRQs may happen while
delayed_work_timer_fn() is in progress leading to indefinite -EAGAIN.

This patch makes delayed_work use the new TIMER_IRQSAFE flag for
delayed_work->timer.  This makes try_to_grab_pending() and thus
mod_delayed_work_on() safe to call from IRQ handlers.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 85bd3409b9f5..b394df8beaee 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1048,16 +1048,14 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
  *		for arbitrarily long
  *
  * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
- * preempted while holding PENDING and @work off queue, preemption must be
- * disabled on entry.  This ensures that we don't return -EAGAIN while
- * another task is preempted in this function.
+ * interrupted while holding PENDING and @work off queue, irq must be
+ * disabled on entry.  This, combined with delayed_work->timer being
+ * irqsafe, ensures that we return -EAGAIN for finite short period of time.
  *
  * On successful return, >= 0, irq is disabled and the caller is
  * responsible for releasing it using local_irq_restore(*@flags).
  *
- * This function is safe to call from any context other than IRQ handler.
- * An IRQ handler may run on top of delayed_work_timer_fn() which can make
- * this function return -EAGAIN perpetually.
+ * This function is safe to call from any context including IRQ handler.
  */
 static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 			       unsigned long *flags)
@@ -1072,6 +1070,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 	if (is_dwork) {
 		struct delayed_work *dwork = to_delayed_work(work);
 
+		/*
+		 * dwork->timer is irqsafe.  If del_timer() fails, it's
+		 * guaranteed that the timer is not queued anywhere and not
+		 * running on the local CPU.
+		 */
 		if (likely(del_timer(&dwork->timer)))
 			return 1;
 	}
@@ -1327,9 +1330,8 @@ void delayed_work_timer_fn(unsigned long __data)
 	struct delayed_work *dwork = (struct delayed_work *)__data;
 	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
 
-	local_irq_disable();
+	/* should have been called from irqsafe timer with irq already off */
 	__queue_work(dwork->cpu, cwq->wq, &dwork->work);
-	local_irq_enable();
 }
 EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
 
@@ -1444,7 +1446,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
  * Returns %false if @dwork was idle and queued, %true if @dwork was
  * pending and its timer was modified.
  *
- * This function is safe to call from any context other than IRQ handler.
+ * This function is safe to call from any context including IRQ handler.
  * See try_to_grab_pending() for details.
  */
 bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
-- 
cgit v1.2.3


From 57b30ae77bf00d2318df711ef9a4d2a9be0a3a2a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Aug 2012 13:18:24 -0700
Subject: workqueue: reimplement cancel_delayed_work() using
 try_to_grab_pending()

cancel_delayed_work() can't be called from IRQ handlers due to its use
of del_timer_sync() and can't cancel work items which are already
transferred from timer to worklist.

Also, unlike other flush and cancel functions, a canceled delayed_work
would still point to the last associated cpu_workqueue.  If the
workqueue is destroyed afterwards and the work item is re-used on a
different workqueue, the queueing code can oops trying to dereference
already freed cpu_workqueue.

This patch reimplements cancel_delayed_work() using
try_to_grab_pending() and set_work_cpu_and_clear_pending().  This
allows the function to be called from IRQ handlers and makes its
behavior consistent with other flush / cancel functions.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/workqueue.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b394df8beaee..039d0fae171a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2948,6 +2948,36 @@ bool flush_delayed_work(struct delayed_work *dwork)
 }
 EXPORT_SYMBOL(flush_delayed_work);
 
+/**
+ * cancel_delayed_work - cancel a delayed work
+ * @dwork: delayed_work to cancel
+ *
+ * Kill off a pending delayed_work.  Returns %true if @dwork was pending
+ * and canceled; %false if it wasn't pending.  Note that the work callback
+ * function may still be running on return, unless it returns %true and the
+ * work doesn't re-arm itself.  Explicitly flush or use
+ * cancel_delayed_work_sync() to wait on it.
+ *
+ * This function is safe to call from any context including IRQ handler.
+ */
+bool cancel_delayed_work(struct delayed_work *dwork)
+{
+	unsigned long flags;
+	int ret;
+
+	do {
+		ret = try_to_grab_pending(&dwork->work, true, &flags);
+	} while (unlikely(ret == -EAGAIN));
+
+	if (unlikely(ret < 0))
+		return false;
+	/* ret is 0 if @dwork was idle (nothing canceled), 1 if pending */
+	set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
+	local_irq_restore(flags);
+	return ret;
+}
+EXPORT_SYMBOL(cancel_delayed_work);
+
 /**
  * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
  * @dwork: the delayed work cancel
-- 
cgit v1.2.3


From ea1abd6197d5805655da1bb589929762f4b4aa08 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 18 Sep 2012 09:59:22 -0700
Subject: workqueue: reimplement idle worker rebinding

Currently rebind_workers() rebinds idle workers synchronously
before proceeding to requesting busy workers to rebind.  This is
necessary because all workers on @worker_pool->idle_list must be bound
before concurrency management local wake-ups from the busy workers
take place.

Unfortunately, the synchronous idle rebinding is quite complicated.
This patch reimplements idle rebinding to simplify the code path.

Rather than trying to make all idle workers bound before rebinding
busy workers, we simply remove all to-be-bound idle workers from the
idle list and let them add themselves back after completing rebinding
(successful or not).

As only workers which finished rebinding can be on the idle worker
list, the idle worker list is guaranteed to have only bound workers
unless CPU went down again and local wake-ups are safe.

After the change, @worker_pool->nr_idle may deviate from the actual
number of idle workers on @worker_pool->idle_list.  More specifically,
nr_idle may be non-zero while ->idle_list is empty.  All users of
->nr_idle and ->idle_list are audited.  The only affected one is
too_many_workers() which is updated to return %false if ->idle_list is
empty regardless of ->nr_idle.

After this patch, rebind_workers() no longer performs the nasty
idle-rebind retries which require temporary release of gcwq->lock, and
both unbinding and rebinding are atomic w.r.t. global_cwq->lock.

worker->idle_rebind and global_cwq->rebind_hold are now unnecessary
and removed along with the definition of struct idle_rebind.

Changed from V1:
	1) remove unlikely from too_many_workers(), ->idle_list can be empty
	   anytime, even before this patch, no reason to use unlikely.
	2) fix a small rebasing mistake.
	   (which is from rebasing the original fix patch onto for-next)
	3) add a lot of comments.
	4) clear WORKER_REBIND unconditionally in idle_worker_rebind()

tj: Updated comments and description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 141 ++++++++++++++++-------------------------------------
 1 file changed, 42 insertions(+), 99 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 31d8a4586d4c..770c1a8128bf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -126,7 +126,6 @@ enum {
 
 struct global_cwq;
 struct worker_pool;
-struct idle_rebind;
 
 /*
  * The poor guys doing the actual heavy lifting.  All on-duty workers
@@ -150,7 +149,6 @@ struct worker {
 	int			id;		/* I: worker id */
 
 	/* for rebinding worker to CPU */
-	struct idle_rebind	*idle_rebind;	/* L: for idle worker */
 	struct work_struct	rebind_work;	/* L: for busy worker */
 };
 
@@ -160,6 +158,8 @@ struct worker_pool {
 
 	struct list_head	worklist;	/* L: list of pending works */
 	int			nr_workers;	/* L: total number of workers */
+
+	/* nr_idle includes the ones off idle_list for rebinding */
 	int			nr_idle;	/* L: currently idle ones */
 
 	struct list_head	idle_list;	/* X: list of idle workers */
@@ -186,8 +186,6 @@ struct global_cwq {
 
 	struct worker_pool	pools[NR_WORKER_POOLS];
 						/* normal and highpri pools */
-
-	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -687,6 +685,13 @@ static bool too_many_workers(struct worker_pool *pool)
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
+	/*
+	 * nr_idle and idle_list may disagree if idle rebinding is in
+	 * progress.  Never return %true if idle_list is empty.
+	 */
+	if (list_empty(&pool->idle_list))
+		return false;
+
 	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 }
 
@@ -1611,37 +1616,26 @@ __acquires(&gcwq->lock)
 	}
 }
 
-struct idle_rebind {
-	int			cnt;		/* # workers to be rebound */
-	struct completion	done;		/* all workers rebound */
-};
-
 /*
- * Rebind an idle @worker to its CPU.  During CPU onlining, this has to
- * happen synchronously for idle workers.  worker_thread() will test
+ * Rebind an idle @worker to its CPU.  worker_thread() will test
  * %WORKER_REBIND before leaving idle and call this function.
  */
 static void idle_worker_rebind(struct worker *worker)
 {
 	struct global_cwq *gcwq = worker->pool->gcwq;
 
-	/* CPU must be online at this point */
-	WARN_ON(!worker_maybe_bind_and_lock(worker));
-	if (!--worker->idle_rebind->cnt)
-		complete(&worker->idle_rebind->done);
-	spin_unlock_irq(&worker->pool->gcwq->lock);
-
-	/* we did our part, wait for rebind_workers() to finish up */
-	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
-
 	/*
-	 * rebind_workers() shouldn't finish until all workers passed the
-	 * above WORKER_REBIND wait.  Tell it when done.
+	 * CPU may go down again inbetween.  If rebinding fails, reinstate
+	 * UNBOUND.  We're off idle_list and nobody else can do it for us.
 	 */
-	spin_lock_irq(&worker->pool->gcwq->lock);
-	if (!--worker->idle_rebind->cnt)
-		complete(&worker->idle_rebind->done);
-	spin_unlock_irq(&worker->pool->gcwq->lock);
+	if (!worker_maybe_bind_and_lock(worker))
+		worker->flags |= WORKER_UNBOUND;
+
+	worker_clr_flags(worker, WORKER_REBIND);
+
+	/* rebind complete, become available again */
+	list_add(&worker->entry, &worker->pool->idle_list);
+	spin_unlock_irq(&gcwq->lock);
 }
 
 /*
@@ -1676,29 +1670,25 @@ static void busy_worker_rebind_fn(struct work_struct *work)
  * @gcwq->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
  * is different for idle and busy ones.
  *
- * The idle ones should be rebound synchronously and idle rebinding should
- * be complete before any worker starts executing work items with
- * concurrency management enabled; otherwise, scheduler may oops trying to
- * wake up non-local idle worker from wq_worker_sleeping().
- *
- * This is achieved by repeatedly requesting rebinding until all idle
- * workers are known to have been rebound under @gcwq->lock and holding all
- * idle workers from becoming busy until idle rebinding is complete.
+ * Idle ones will be removed from the idle_list and woken up.  They will
+ * add themselves back after completing rebind.  This ensures that the
+ * idle_list doesn't contain any unbound workers when re-bound busy workers
+ * try to perform local wake-ups for concurrency management.
  *
- * Once idle workers are rebound, busy workers can be rebound as they
- * finish executing their current work items.  Queueing the rebind work at
- * the head of their scheduled lists is enough.  Note that nr_running will
- * be properbly bumped as busy workers rebind.
+ * Busy workers can rebind after they finish their current work items.
+ * Queueing the rebind work item at the head of the scheduled list is
+ * enough.  Note that nr_running will be properly bumped as busy workers
+ * rebind.
  *
- * On return, all workers are guaranteed to either be bound or have rebind
- * work item scheduled.
+ * On return, all non-manager workers are scheduled for rebind - see
+ * manage_workers() for the manager special case.  Any idle worker
+ * including the manager will not appear on @idle_list until rebind is
+ * complete, making local wake-ups safe.
  */
 static void rebind_workers(struct global_cwq *gcwq)
-	__releases(&gcwq->lock) __acquires(&gcwq->lock)
 {
-	struct idle_rebind idle_rebind;
 	struct worker_pool *pool;
-	struct worker *worker;
+	struct worker *worker, *n;
 	struct hlist_node *pos;
 	int i;
 
@@ -1707,46 +1697,29 @@ static void rebind_workers(struct global_cwq *gcwq)
 	for_each_worker_pool(pool, gcwq)
 		lockdep_assert_held(&pool->manager_mutex);
 
-	/*
-	 * Rebind idle workers.  Interlocked both ways.  We wait for
-	 * workers to rebind via @idle_rebind.done.  Workers will wait for
-	 * us to finish up by watching %WORKER_REBIND.
-	 */
-	init_completion(&idle_rebind.done);
-retry:
-	idle_rebind.cnt = 1;
-	INIT_COMPLETION(idle_rebind.done);
-
-	/* set REBIND and kick idle ones, we'll wait for these later */
+	/* set REBIND and kick idle ones */
 	for_each_worker_pool(pool, gcwq) {
-		list_for_each_entry(worker, &pool->idle_list, entry) {
+		list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
 			unsigned long worker_flags = worker->flags;
 
-			if (worker->flags & WORKER_REBIND)
-				continue;
-
 			/* morph UNBOUND to REBIND atomically */
 			worker_flags &= ~WORKER_UNBOUND;
 			worker_flags |= WORKER_REBIND;
 			ACCESS_ONCE(worker->flags) = worker_flags;
 
-			idle_rebind.cnt++;
-			worker->idle_rebind = &idle_rebind;
+			/*
+			 * idle workers should be off @pool->idle_list
+			 * until rebind is complete to avoid receiving
+			 * premature local wake-ups.
+			 */
+			list_del_init(&worker->entry);
 
 			/* worker_thread() will call idle_worker_rebind() */
 			wake_up_process(worker->task);
 		}
 	}
 
-	if (--idle_rebind.cnt) {
-		spin_unlock_irq(&gcwq->lock);
-		wait_for_completion(&idle_rebind.done);
-		spin_lock_irq(&gcwq->lock);
-		/* busy ones might have become idle while waiting, retry */
-		goto retry;
-	}
-
-	/* all idle workers are rebound, rebind busy workers */
+	/* rebind busy workers */
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		unsigned long worker_flags = worker->flags;
 		struct work_struct *rebind_work = &worker->rebind_work;
@@ -1776,34 +1749,6 @@ retry:
 			worker->scheduled.next,
 			work_color_to_flags(WORK_NO_COLOR));
 	}
-
-	/*
-	 * All idle workers are rebound and waiting for %WORKER_REBIND to
-	 * be cleared inside idle_worker_rebind().  Clear and release.
-	 * Clearing %WORKER_REBIND from this foreign context is safe
-	 * because these workers are still guaranteed to be idle.
-	 *
-	 * We need to make sure all idle workers passed WORKER_REBIND wait
-	 * in idle_worker_rebind() before returning; otherwise, workers can
-	 * get stuck at the wait if hotplug cycle repeats.
-	 */
-	idle_rebind.cnt = 1;
-	INIT_COMPLETION(idle_rebind.done);
-
-	for_each_worker_pool(pool, gcwq) {
-		list_for_each_entry(worker, &pool->idle_list, entry) {
-			worker->flags &= ~WORKER_REBIND;
-			idle_rebind.cnt++;
-		}
-	}
-
-	wake_up_all(&gcwq->rebind_hold);
-
-	if (--idle_rebind.cnt) {
-		spin_unlock_irq(&gcwq->lock);
-		wait_for_completion(&idle_rebind.done);
-		spin_lock_irq(&gcwq->lock);
-	}
 }
 
 static struct worker *alloc_worker(void)
@@ -3916,8 +3861,6 @@ static int __init init_workqueues(void)
 			mutex_init(&pool->manager_mutex);
 			ida_init(&pool->worker_ida);
 		}
-
-		init_waitqueue_head(&gcwq->rebind_hold);
 	}
 
 	/* create the initial worker */
-- 
cgit v1.2.3


From eab6d82843ee1df244f8847d1bf8bb89160ec4aa Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 18 Sep 2012 09:59:22 -0700
Subject: workqueue: WORKER_REBIND is no longer necessary for busy rebinding

Because the old unbind/rebinding implementation wasn't atomic w.r.t.
GCWQ_DISASSOCIATED manipulation which is protected by
global_cwq->lock, we had to use two flags, WORKER_UNBOUND and
WORKER_REBIND, to avoid incorrectly losing all NOT_RUNNING bits with
back-to-back CPU hotplug operations; otherwise, completion of
rebinding while another unbinding is in progress could clear UNBOUND
prematurely.

Now that both unbind/rebinding are atomic w.r.t. GCWQ_DISASSOCIATED,
there's no need to use two flags.  Just one is enough.  Don't use
WORKER_REBIND for busy rebinding.

tj: Updated description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 770c1a8128bf..794724efb733 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1649,16 +1649,8 @@ static void busy_worker_rebind_fn(struct work_struct *work)
 	struct worker *worker = container_of(work, struct worker, rebind_work);
 	struct global_cwq *gcwq = worker->pool->gcwq;
 
-	worker_maybe_bind_and_lock(worker);
-
-	/*
-	 * %WORKER_REBIND must be cleared even if the above binding failed;
-	 * otherwise, we may confuse the next CPU_UP cycle or oops / get
-	 * stuck by calling idle_worker_rebind() prematurely.  If CPU went
-	 * down again inbetween, %WORKER_UNBOUND would be set, so clearing
-	 * %WORKER_REBIND is always safe.
-	 */
-	worker_clr_flags(worker, WORKER_REBIND);
+	if (worker_maybe_bind_and_lock(worker))
+		worker_clr_flags(worker, WORKER_UNBOUND);
 
 	spin_unlock_irq(&gcwq->lock);
 }
@@ -1721,15 +1713,9 @@ static void rebind_workers(struct global_cwq *gcwq)
 
 	/* rebind busy workers */
 	for_each_busy_worker(worker, i, pos, gcwq) {
-		unsigned long worker_flags = worker->flags;
 		struct work_struct *rebind_work = &worker->rebind_work;
 		struct workqueue_struct *wq;
 
-		/* morph UNBOUND to REBIND atomically */
-		worker_flags &= ~WORKER_UNBOUND;
-		worker_flags |= WORKER_REBIND;
-		ACCESS_ONCE(worker->flags) = worker_flags;
-
 		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
 				     work_data_bits(rebind_work)))
 			continue;
-- 
cgit v1.2.3


From 5f7dabfd5cb115937afb4649e4c73b02f927f6ae Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 18 Sep 2012 09:59:23 -0700
Subject: workqueue: WORKER_REBIND is no longer necessary for idle rebinding

Now both worker destruction and idle rebinding remove the worker from
idle list while it's still idle, so list_empty(&worker->entry) can be
used to test whether either is pending and WORKER_DIE to distinguish
between the two instead, making WORKER_REBIND unnecessary.

Use list_empty(&worker->entry) to determine whether destruction or
rebinding is pending.  This simplifies worker state transitions.

WORKER_REBIND is not needed anymore.  Remove it.

tj: Updated comments and description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 794724efb733..cdc6bfc84b78 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -73,11 +73,10 @@ enum {
 	WORKER_DIE		= 1 << 1,	/* die die die */
 	WORKER_IDLE		= 1 << 2,	/* is idle */
 	WORKER_PREP		= 1 << 3,	/* preparing to run works */
-	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */
 	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
 	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
 
-	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
+	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_UNBOUND |
 				  WORKER_CPU_INTENSIVE,
 
 	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */
@@ -1618,20 +1617,15 @@ __acquires(&gcwq->lock)
 
 /*
  * Rebind an idle @worker to its CPU.  worker_thread() will test
- * %WORKER_REBIND before leaving idle and call this function.
+ * list_empty(@worker->entry) before leaving idle and call this function.
  */
 static void idle_worker_rebind(struct worker *worker)
 {
 	struct global_cwq *gcwq = worker->pool->gcwq;
 
-	/*
-	 * CPU may go down again inbetween.  If rebinding fails, reinstate
-	 * UNBOUND.  We're off idle_list and nobody else can do it for us.
-	 */
-	if (!worker_maybe_bind_and_lock(worker))
-		worker->flags |= WORKER_UNBOUND;
-
-	worker_clr_flags(worker, WORKER_REBIND);
+	/* CPU may go down again inbetween, clear UNBOUND only on success */
+	if (worker_maybe_bind_and_lock(worker))
+		worker_clr_flags(worker, WORKER_UNBOUND);
 
 	/* rebind complete, become available again */
 	list_add(&worker->entry, &worker->pool->idle_list);
@@ -1689,16 +1683,9 @@ static void rebind_workers(struct global_cwq *gcwq)
 	for_each_worker_pool(pool, gcwq)
 		lockdep_assert_held(&pool->manager_mutex);
 
-	/* set REBIND and kick idle ones */
+	/* dequeue and kick idle ones */
 	for_each_worker_pool(pool, gcwq) {
 		list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
-			unsigned long worker_flags = worker->flags;
-
-			/* morph UNBOUND to REBIND atomically */
-			worker_flags &= ~WORKER_UNBOUND;
-			worker_flags |= WORKER_REBIND;
-			ACCESS_ONCE(worker->flags) = worker_flags;
-
 			/*
 			 * idle workers should be off @pool->idle_list
 			 * until rebind is complete to avoid receiving
@@ -1706,7 +1693,10 @@ static void rebind_workers(struct global_cwq *gcwq)
 			 */
 			list_del_init(&worker->entry);
 
-			/* worker_thread() will call idle_worker_rebind() */
+			/*
+			 * worker_thread() will see the above dequeuing
+			 * and call idle_worker_rebind().
+			 */
 			wake_up_process(worker->task);
 		}
 	}
@@ -2176,7 +2166,7 @@ __acquires(&gcwq->lock)
 	 * necessary to avoid spurious warnings from rescuers servicing the
 	 * unbound or a disassociated gcwq.
 	 */
-	WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
+	WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
 		     !(gcwq->flags & GCWQ_DISASSOCIATED) &&
 		     raw_smp_processor_id() != gcwq->cpu);
 
@@ -2300,18 +2290,17 @@ static int worker_thread(void *__worker)
 woke_up:
 	spin_lock_irq(&gcwq->lock);
 
-	/*
-	 * DIE can be set only while idle and REBIND set while busy has
-	 * @worker->rebind_work scheduled.  Checking here is enough.
-	 */
-	if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
+	/* we are off idle list if destruction or rebind is requested */
+	if (unlikely(list_empty(&worker->entry))) {
 		spin_unlock_irq(&gcwq->lock);
 
+		/* if DIE is set, destruction is requested */
 		if (worker->flags & WORKER_DIE) {
 			worker->task->flags &= ~PF_WQ_WORKER;
 			return 0;
 		}
 
+		/* otherwise, rebind */
 		idle_worker_rebind(worker);
 		goto woke_up;
 	}
-- 
cgit v1.2.3


From b2eb83d123c1cc9f96a8e452b26a6ebe631b3ad7 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 18 Sep 2012 09:59:23 -0700
Subject: workqueue: rename manager_mutex to assoc_mutex

Now that manager_mutex's role has changed from synchronizing manager
role to excluding hotplug against manager, the name is misleading.

As it is protecting the CPU-association of the gcwq now, rename it to
assoc_mutex.

This patch is pure rename and doesn't introduce any functional change.

tj: Updated comments and description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index cdc6bfc84b78..e651239f1ece 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -58,7 +58,7 @@ enum {
 	 * be executing on any CPU.  The gcwq behaves as an unbound one.
 	 *
 	 * Note that DISASSOCIATED can be flipped only while holding
-	 * managership of all pools on the gcwq to avoid changing binding
+	 * assoc_mutex of all pools on the gcwq to avoid changing binding
 	 * state while create_worker() is in progress.
 	 */
 	GCWQ_DISASSOCIATED	= 1 << 0,	/* cpu can't serve workers */
@@ -165,7 +165,7 @@ struct worker_pool {
 	struct timer_list	idle_timer;	/* L: worker idle timeout */
 	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
 
-	struct mutex		manager_mutex;	/* mutex manager should hold */
+	struct mutex		assoc_mutex;	/* protect GCWQ_DISASSOCIATED */
 	struct ida		worker_ida;	/* L: for worker IDs */
 };
 
@@ -1681,7 +1681,7 @@ static void rebind_workers(struct global_cwq *gcwq)
 	lockdep_assert_held(&gcwq->lock);
 
 	for_each_worker_pool(pool, gcwq)
-		lockdep_assert_held(&pool->manager_mutex);
+		lockdep_assert_held(&pool->assoc_mutex);
 
 	/* dequeue and kick idle ones */
 	for_each_worker_pool(pool, gcwq) {
@@ -2081,22 +2081,22 @@ static bool manage_workers(struct worker *worker)
 	 * grab %POOL_MANAGING_WORKERS to achieve this because that can
 	 * lead to idle worker depletion (all become busy thinking someone
 	 * else is managing) which in turn can result in deadlock under
-	 * extreme circumstances.  Use @pool->manager_mutex to synchronize
+	 * extreme circumstances.  Use @pool->assoc_mutex to synchronize
 	 * manager against CPU hotplug.
 	 *
-	 * manager_mutex would always be free unless CPU hotplug is in
+	 * assoc_mutex would always be free unless CPU hotplug is in
 	 * progress.  trylock first without dropping @gcwq->lock.
 	 */
-	if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
+	if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
 		spin_unlock_irq(&pool->gcwq->lock);
-		mutex_lock(&pool->manager_mutex);
+		mutex_lock(&pool->assoc_mutex);
 		/*
 		 * CPU hotplug could have happened while we were waiting
-		 * for manager_mutex.  Hotplug itself can't handle us
+		 * for assoc_mutex.  Hotplug itself can't handle us
 		 * because manager isn't either on idle or busy list, and
 		 * @gcwq's state and ours could have deviated.
 		 *
-		 * As hotplug is now excluded via manager_mutex, we can
+		 * As hotplug is now excluded via assoc_mutex, we can
 		 * simply try to bind.  It will succeed or fail depending
 		 * on @gcwq's current state.  Try it and adjust
 		 * %WORKER_UNBOUND accordingly.
@@ -2119,7 +2119,7 @@ static bool manage_workers(struct worker *worker)
 	ret |= maybe_create_worker(pool);
 
 	pool->flags &= ~POOL_MANAGING_WORKERS;
-	mutex_unlock(&pool->manager_mutex);
+	mutex_unlock(&pool->assoc_mutex);
 	return ret;
 }
 
@@ -3474,23 +3474,23 @@ EXPORT_SYMBOL_GPL(work_busy);
  */
 
 /* claim manager positions of all pools */
-static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
+static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
 {
 	struct worker_pool *pool;
 
 	for_each_worker_pool(pool, gcwq)
-		mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
+		mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
 	spin_lock_irq(&gcwq->lock);
 }
 
 /* release manager positions */
-static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
+static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
 {
 	struct worker_pool *pool;
 
 	spin_unlock_irq(&gcwq->lock);
 	for_each_worker_pool(pool, gcwq)
-		mutex_unlock(&pool->manager_mutex);
+		mutex_unlock(&pool->assoc_mutex);
 }
 
 static void gcwq_unbind_fn(struct work_struct *work)
@@ -3503,7 +3503,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
 	BUG_ON(gcwq->cpu != smp_processor_id());
 
-	gcwq_claim_management_and_lock(gcwq);
+	gcwq_claim_assoc_and_lock(gcwq);
 
 	/*
 	 * We've claimed all manager positions.  Make all workers unbound
@@ -3520,7 +3520,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
 	gcwq->flags |= GCWQ_DISASSOCIATED;
 
-	gcwq_release_management_and_unlock(gcwq);
+	gcwq_release_assoc_and_unlock(gcwq);
 
 	/*
 	 * Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3576,10 +3576,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		gcwq_claim_management_and_lock(gcwq);
+		gcwq_claim_assoc_and_lock(gcwq);
 		gcwq->flags &= ~GCWQ_DISASSOCIATED;
 		rebind_workers(gcwq);
-		gcwq_release_management_and_unlock(gcwq);
+		gcwq_release_assoc_and_unlock(gcwq);
 		break;
 	}
 	return NOTIFY_OK;
@@ -3833,7 +3833,7 @@ static int __init init_workqueues(void)
 			setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
 				    (unsigned long)pool);
 
-			mutex_init(&pool->manager_mutex);
+			mutex_init(&pool->assoc_mutex);
 			ida_init(&pool->worker_ida);
 		}
 	}
-- 
cgit v1.2.3


From 9fdf9b73d61c87a9c16f101bb8bbe069d13046f5 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 18 Sep 2012 09:59:23 -0700
Subject: workqueue: use __cpuinit instead of __devinit for cpu callbacks

For workqueue hotplug callbacks, it makes less sense to use __devinit
which discards the memory after boot if !HOTPLUG.  __cpuinit, which
discards the memory after boot if !HOTPLUG_CPU fits better.

tj: Updated description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e651239f1ece..942bb750a650 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3548,7 +3548,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
  * Workqueues should be brought up before normal priority CPU notifiers.
  * This will be registered high priority CPU notifier.
  */
-static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 					       unsigned long action,
 					       void *hcpu)
 {
@@ -3589,7 +3589,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
  * Workqueues should be brought down after normal priority CPU notifiers.
  * This will be registered as low priority CPU notifier.
  */
-static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 						 unsigned long action,
 						 void *hcpu)
 {
-- 
cgit v1.2.3


From a5b4e57d7cc07cb28ccf16de0876a4770ae84920 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 18 Sep 2012 09:59:23 -0700
Subject: workqueue: use hotcpu_notifier() for workqueue_cpu_down_callback()

workqueue_cpu_down_callback() is used only if HOTPLUG_CPU=y, so
hotcpu_notifier() fits better than cpu_notifier().

When HOTPLUG_CPU=y, hotcpu_notifier() and cpu_notifier() are the same.

When HOTPLUG_CPU=n, if we use cpu_notifier(),
workqueue_cpu_down_callback() will be called during boot to do
nothing, and the memory of workqueue_cpu_down_callback() and
gcwq_unbind_fn() will be discarded after boot.

If we use hotcpu_notifier(), we can avoid the no-op call of
workqueue_cpu_down_callback() and the memory of
workqueue_cpu_down_callback() and gcwq_unbind_fn() will be discarded at
build time:

$ ls -l kernel/workqueue.o.cpu_notifier kernel/workqueue.o.hotcpu_notifier
-rw-rw-r-- 1 laijs laijs 484080 Sep 15 11:31 kernel/workqueue.o.cpu_notifier
-rw-rw-r-- 1 laijs laijs 478240 Sep 15 11:31 kernel/workqueue.o.hotcpu_notifier

$ size kernel/workqueue.o.cpu_notifier kernel/workqueue.o.hotcpu_notifier
   text	   data	    bss	    dec	    hex	filename
  18513	   2387	   1221	  22121	   5669	kernel/workqueue.o.cpu_notifier
  18082	   2355	   1221	  21658	   549a	kernel/workqueue.o.hotcpu_notifier

tj: Updated description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 942bb750a650..48becaba1c94 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3807,7 +3807,7 @@ static int __init init_workqueues(void)
 		     WORK_CPU_LAST);
 
 	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-	cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
 
 	/* initialize gcwqs */
 	for_each_gcwq_cpu(cpu) {
-- 
cgit v1.2.3


From 3aa62497594430ea522050b75c033f71f2c60ee6 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 18 Sep 2012 10:40:00 -0700
Subject: workqueue: fix possible stall on try_to_grab_pending() of a delayed
 work item

Currently, when try_to_grab_pending() grabs a delayed work item, it
leaves its linked work items alone on the delayed_works.  The linked
work items are always NO_COLOR and will cause future
cwq_activate_first_delayed() to increase cwq->nr_active incorrectly, and
may cause the whole cwq to stall.  For example,

state: cwq->max_active = 1, cwq->nr_active = 1
       one work in cwq->pool, many in cwq->delayed_works.

step1: try_to_grab_pending() removes a work item from delayed_works
       but leaves its NO_COLOR linked work items on it.

step2: Later on, cwq_activate_first_delayed() activates the linked
       work item increasing ->nr_active.

step3: cwq->nr_active = 1, but all activated work items of the cwq are
       NO_COLOR.  When they finish, cwq->nr_active will not be
       decreased due to NO_COLOR, and no further work items will be
       activated from cwq->delayed_works.  The cwq stalls.

Fix it by ensuring the target work item is activated before stealing
PENDING in try_to_grab_pending().  This ensures that all the linked
work items are activated without incorrectly bumping cwq->nr_active.

tj: Updated comment and description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@kernel.org
---
 kernel/workqueue.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 48becaba1c94..d2fe8e77ceb7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -977,10 +977,9 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
 		*nextp = n;
 }
 
-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+static void cwq_activate_delayed_work(struct work_struct *work)
 {
-	struct work_struct *work = list_first_entry(&cwq->delayed_works,
-						    struct work_struct, entry);
+	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
 
 	trace_workqueue_activate_work(work);
 	move_linked_works(work, &cwq->pool->worklist, NULL);
@@ -988,6 +987,14 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
 	cwq->nr_active++;
 }
 
+static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+{
+	struct work_struct *work = list_first_entry(&cwq->delayed_works,
+						    struct work_struct, entry);
+
+	cwq_activate_delayed_work(work);
+}
+
 /**
  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
  * @cwq: cwq of interest
@@ -1106,6 +1113,18 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 		smp_rmb();
 		if (gcwq == get_work_gcwq(work)) {
 			debug_work_deactivate(work);
+
+			/*
+			 * A delayed work item cannot be grabbed directly
+			 * because it might have linked NO_COLOR work items
+			 * which, if left on the delayed_list, will confuse
+			 * cwq->nr_active management later on and cause
+			 * stall.  Make sure the work item is activated
+			 * before grabbing.
+			 */
+			if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
+				cwq_activate_delayed_work(work);
+
 			list_del_init(&work->entry);
 			cwq_dec_nr_in_flight(get_work_cwq(work),
 				get_work_color(work),
-- 
cgit v1.2.3


From b3f9f405a21a29c06c31fb2d6ab36ef9ba7c027b Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 18 Sep 2012 10:40:00 -0700
Subject: workqueue: remove @delayed from cwq_dec_nr_in_flight()

@delayed is now always false for all callers, remove it.

tj: Updated description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d2fe8e77ceb7..3e324aae3c98 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -999,7 +999,6 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
  * @cwq: cwq of interest
  * @color: color of work which left the queue
- * @delayed: for a delayed work
  *
  * A work either has completed or is removed from pending queue,
  * decrement nr_in_flight of its cwq and handle workqueue flushing.
@@ -1007,8 +1006,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
  * CONTEXT:
  * spin_lock_irq(gcwq->lock).
  */
-static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
-				 bool delayed)
+static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
 {
 	/* ignore uncolored works */
 	if (color == WORK_NO_COLOR)
@@ -1016,13 +1014,11 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
 
 	cwq->nr_in_flight[color]--;
 
-	if (!delayed) {
-		cwq->nr_active--;
-		if (!list_empty(&cwq->delayed_works)) {
-			/* one down, submit a delayed one */
-			if (cwq->nr_active < cwq->max_active)
-				cwq_activate_first_delayed(cwq);
-		}
+	cwq->nr_active--;
+	if (!list_empty(&cwq->delayed_works)) {
+		/* one down, submit a delayed one */
+		if (cwq->nr_active < cwq->max_active)
+			cwq_activate_first_delayed(cwq);
 	}
 
 	/* is flush in progress and are we at the flushing tip? */
@@ -1127,8 +1123,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 
 			list_del_init(&work->entry);
 			cwq_dec_nr_in_flight(get_work_cwq(work),
-				get_work_color(work),
-				*work_data_bits(work) & WORK_STRUCT_DELAYED);
+				get_work_color(work));
 
 			spin_unlock(&gcwq->lock);
 			return 1;
@@ -2264,7 +2259,7 @@ __acquires(&gcwq->lock)
 	hlist_del_init(&worker->hentry);
 	worker->current_work = NULL;
 	worker->current_cwq = NULL;
-	cwq_dec_nr_in_flight(cwq, work_color, false);
+	cwq_dec_nr_in_flight(cwq, work_color);
 }
 
 /**
-- 
cgit v1.2.3


From 9f4bd4cddbb50d7617353102e10ce511c5ef6df2 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 19 Sep 2012 10:40:48 -0700
Subject: workqueue: introduce cwq_set_max_active() helper for
 thaw_workqueues()

Using a helper instead of open code makes thaw_workqueues() clearer.
The helper will also be used by the next patch.

tj: Slight update to comment and description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3e324aae3c98..b5d722548ffd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3366,6 +3366,26 @@ void destroy_workqueue(struct workqueue_struct *wq)
 }
 EXPORT_SYMBOL_GPL(destroy_workqueue);
 
+/**
+ * cwq_set_max_active - adjust max_active of a cwq
+ * @cwq: target cpu_workqueue_struct
+ * @max_active: new max_active value.
+ *
+ * Set @cwq->max_active to @max_active and activate delayed works if
+ * increased.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
+{
+	cwq->max_active = max_active;
+
+	while (!list_empty(&cwq->delayed_works) &&
+	       cwq->nr_active < cwq->max_active)
+		cwq_activate_first_delayed(cwq);
+}
+
 /**
  * workqueue_set_max_active - adjust max_active of a workqueue
  * @wq: target workqueue
@@ -3792,11 +3812,7 @@ void thaw_workqueues(void)
 				continue;
 
 			/* restore max_active and repopulate worklist */
-			cwq->max_active = wq->saved_max_active;
-
-			while (!list_empty(&cwq->delayed_works) &&
-			       cwq->nr_active < cwq->max_active)
-				cwq_activate_first_delayed(cwq);
+			cwq_set_max_active(cwq, wq->saved_max_active);
 		}
 
 		for_each_worker_pool(pool, gcwq)
-- 
cgit v1.2.3


From 70369b117a8fc5ac18a635ced23ee49f8e722e7b Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 19 Sep 2012 10:40:48 -0700
Subject: workqueue: use cwq_set_max_active() helper for
 workqueue_set_max_active()

workqueue_set_max_active() may increase ->max_active without
activating delayed works and may make the activation order differ from
the queueing order.  Both aren't strictly bugs but the resulting
behavior could be a bit odd.

To make things more consistent, use cwq_set_max_active() helper which
immediately makes use of the newly increased max_active if there are
delayed work items and also keeps the activation order.

tj: Slight update to description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b5d722548ffd..4f5c61f8b0e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3413,7 +3413,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
 
 		if (!(wq->flags & WQ_FREEZABLE) ||
 		    !(gcwq->flags & GCWQ_FREEZING))
-			get_cwq(gcwq->cpu, wq)->max_active = max_active;
+			cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);
 
 		spin_unlock_irq(&gcwq->lock);
 	}
-- 
cgit v1.2.3


From 7c6e72e46c9ea4a88f3f8ba96edce9db4bd48726 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 20 Sep 2012 10:03:19 -0700
Subject: workqueue: remove spurious WARN_ON_ONCE(in_irq()) from
 try_to_grab_pending()

e0aecdd874 ("workqueue: use irqsafe timer for delayed_work") made
try_to_grab_pending() safe to use from irq context but forgot to
remove WARN_ON_ONCE(in_irq()).  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
---
 kernel/workqueue.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4f5c61f8b0e7..143fd8c751f4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1070,8 +1070,6 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 {
 	struct global_cwq *gcwq;
 
-	WARN_ON_ONCE(in_irq());
-
 	local_irq_save(*flags);
 
 	/* try to steal the timer if it exists */
-- 
cgit v1.2.3