8 files changed, 76 insertions, 59 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1a649f2bb9bb..4ea6f0dc2fc5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -816,6 +816,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	struct cpuset trialcs;
 	int retval, cpus_unchanged;
 
+	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = cpulist_parse(buf, trialcs.cpus_allowed);
 	if (retval < 0)
@@ -2033,6 +2037,33 @@ out:
 	return err;
 }
 
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period.  This is necessary in order to make cpusets transparent
+ * (of no effect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This handles CPU hotplug (cpuhp) events.  If someday Memory
+ * Nodes can be hotplugged (dynamically changing node_online_map)
+ * then we should handle that too, perhaps in a similar way.
+ */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+				unsigned long phase, void *cpu)
+{
+	mutex_lock(&manage_mutex);	/* outer lock, dropped last */
+	mutex_lock(&callback_mutex);
+	/* nb, phase and cpu are unused: resync on every notification */
+	top_cpuset.cpus_allowed = cpu_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+
+	return 0;
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
@@ -2043,6 +2074,8 @@ void __init cpuset_init_smp(void)
 {
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_online_map;
+
+	hotcpu_notifier(cpuset_handle_cpuhp, 0);
 }
 
 /**
@@ -2387,7 +2420,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
 {
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
-	int overlap = 0;		/* do cpusets overlap? */
+	int overlap = 1;		/* do cpusets overlap? */
 
 	task_lock(current);
 	if (current->flags & PF_EXITING) {
diff --git a/kernel/futex.c b/kernel/futex.c
index c2b2e0b83abf..b9b8aea5389e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -297,7 +297,7 @@ static int futex_handle_fault(unsigned long address, int attempt)
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
 
-	if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+	if (attempt > 2 || !(vma = find_vma(mm, address)) ||
 	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
 		return -EFAULT;
 
@@ -397,7 +397,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 		p = NULL;
 		goto out_unlock;
 	}
-	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+	if (p->exit_state != 0) {
 		p = NULL;
 		goto out_unlock;
 	}
@@ -747,8 +747,10 @@ retry:
 		 */
 		if (attempt++) {
 			if (futex_handle_fault((unsigned long)uaddr2,
-					       attempt))
+						attempt)) {
+				ret = -EFAULT;
 				goto out;
+			}
 			goto retry;
 		}
 
@@ -1322,9 +1324,10 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt))
+		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+			ret = -EFAULT;
 			goto out_unlock_release_sem;
-
+		}
 		goto retry_locked;
 	}
 
@@ -1506,9 +1509,10 @@ pi_faulted:
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt))
+		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+			ret = -EFAULT;
 			goto out_unlock;
-
+		}
 		goto retry_locked;
 	}
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index be989efc7856..21c38a7e666b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -187,7 +187,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	struct hrtimer_base *new_base;
 
-	new_base = &__get_cpu_var(hrtimer_bases[base->index]);
+	new_base = &__get_cpu_var(hrtimer_bases)[base->index];
 
 	if (base != new_base) {
 		/*
diff --git a/kernel/panic.c b/kernel/panic.c
index d8a0bca21233..9b8dcfd1ca93 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/nmi.h>
 #include <linux/kexec.h>
+#include <linux/debug_locks.h>
 
 int panic_on_oops;
 int tainted;
diff --git a/kernel/sched.c b/kernel/sched.c
index a2be2d055299..a234fbee1238 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4162,10 +4162,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 		read_unlock_irq(&tasklist_lock);
 		return -ESRCH;
 	}
-	get_task_struct(p);
-	read_unlock_irq(&tasklist_lock);
 	retval = sched_setscheduler(p, policy, &lparam);
-	put_task_struct(p);
+	read_unlock_irq(&tasklist_lock);
 
 	return retval;
 }
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..51cacd111dbd 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -111,7 +111,6 @@ static int stop_machine(void)
 	/* If some failed, kill them all. */
 	if (ret < 0) {
 		stopmachine_set_state(STOPMACHINE_EXIT);
-		up(&stopmachine_mutex);
 		return ret;
 	}
 
diff --git a/kernel/timer.c b/kernel/timer.c
index b650f04888ed..1d7dd6267c2d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1324,46 +1324,19 @@ asmlinkage long sys_getpid(void)
 }
 
 /*
- * Accessing ->group_leader->real_parent is not SMP-safe, it could
- * change from under us. However, rather than getting any lock
- * we can use an optimistic algorithm: get the parent
- * pid, and go back and check that the parent is still
- * the same. If it has changed (which is extremely unlikely
- * indeed), we just try again..
- *
- * NOTE! This depends on the fact that even if we _do_
- * get an old value of "parent", we can happily dereference
- * the pointer (it was and remains a dereferencable kernel pointer
- * no matter what): we just can't necessarily trust the result
- * until we know that the parent pointer is valid.
- *
- * NOTE2: ->group_leader never changes from under us.
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
  */
 asmlinkage long sys_getppid(void)
 {
 	int pid;
-	struct task_struct *me = current;
-	struct task_struct *parent;
 
-	parent = me->group_leader->real_parent;
-	for (;;) {
-		pid = parent->tgid;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-{
-		struct task_struct *old = parent;
+	rcu_read_lock();
+	pid = rcu_dereference(current->real_parent)->tgid;
+	rcu_read_unlock();
 
-		/*
-		 * Make sure we read the pid before re-reading the
-		 * parent pointer:
-		 */
-		smp_rmb();
-		parent = me->group_leader->real_parent;
-		if (old != parent)
-			continue;
-}
-#endif
-		break;
-	}
 	return pid;
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 448e8f7b342d..835fe28b87a8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,7 +68,7 @@ struct workqueue_struct {
 
 /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
    threads to each one as cpus come/go. */
-static DEFINE_SPINLOCK(workqueue_lock);
+static DEFINE_MUTEX(workqueue_mutex);
 static LIST_HEAD(workqueues);
 
 static int singlethread_cpu;
@@ -320,10 +320,10 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 	} else {
 		int cpu;
 
-		lock_cpu_hotplug();
+		mutex_lock(&workqueue_mutex);
 		for_each_online_cpu(cpu)
 			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
-		unlock_cpu_hotplug();
+		mutex_unlock(&workqueue_mutex);
 	}
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -371,8 +371,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	}
 
 	wq->name = name;
-	/* We don't need the distraction of CPUs appearing and vanishing. */
-	lock_cpu_hotplug();
+	mutex_lock(&workqueue_mutex);
 	if (singlethread) {
 		INIT_LIST_HEAD(&wq->list);
 		p = create_workqueue_thread(wq, singlethread_cpu);
@@ -381,9 +380,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 		else
 			wake_up_process(p);
 	} else {
-		spin_lock(&workqueue_lock);
 		list_add(&wq->list, &workqueues);
-		spin_unlock(&workqueue_lock);
 		for_each_online_cpu(cpu) {
 			p = create_workqueue_thread(wq, cpu);
 			if (p) {
@@ -393,7 +390,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 				destroy = 1;
 		}
 	}
-	unlock_cpu_hotplug();
+	mutex_unlock(&workqueue_mutex);
 
 	/*
 	 * Was there any error during startup? If yes then clean up:
@@ -434,17 +431,15 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	flush_workqueue(wq);
 
 	/* We don't need the distraction of CPUs appearing and vanishing. */
-	lock_cpu_hotplug();
+	mutex_lock(&workqueue_mutex);
 	if (is_single_threaded(wq))
 		cleanup_workqueue_thread(wq, singlethread_cpu);
 	else {
 		for_each_online_cpu(cpu)
 			cleanup_workqueue_thread(wq, cpu);
-		spin_lock(&workqueue_lock);
 		list_del(&wq->list);
-		spin_unlock(&workqueue_lock);
 	}
-	unlock_cpu_hotplug();
+	mutex_unlock(&workqueue_mutex);
 	free_percpu(wq->cpu_wq);
 	kfree(wq);
 }
@@ -515,11 +510,13 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
 	if (!works)
 		return -ENOMEM;
 
+	mutex_lock(&workqueue_mutex);
 	for_each_online_cpu(cpu) {
 		INIT_WORK(per_cpu_ptr(works, cpu), func, info);
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
 				per_cpu_ptr(works, cpu));
 	}
+	mutex_unlock(&workqueue_mutex);
 	flush_workqueue(keventd_wq);
 	free_percpu(works);
 	return 0;
@@ -635,6 +632,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+		mutex_lock(&workqueue_mutex);
 		/* Create a new workqueue thread for it. */
 		list_for_each_entry(wq, &workqueues, list) {
 			if (!create_workqueue_thread(wq, hotcpu)) {
@@ -653,6 +651,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 			kthread_bind(cwq->thread, hotcpu);
 			wake_up_process(cwq->thread);
 		}
+		mutex_unlock(&workqueue_mutex);
 		break;
 
 	case CPU_UP_CANCELED:
@@ -664,6 +663,15 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 				     any_online_cpu(cpu_online_map));
 			cleanup_workqueue_thread(wq, hotcpu);
 		}
+		mutex_unlock(&workqueue_mutex);
+		break;
+
+	case CPU_DOWN_PREPARE:
+		mutex_lock(&workqueue_mutex);
+		break;
+
+	case CPU_DOWN_FAILED:
+		mutex_unlock(&workqueue_mutex);
 		break;
 
 	case CPU_DEAD:
@@ -671,6 +679,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 			cleanup_workqueue_thread(wq, hotcpu);
 		list_for_each_entry(wq, &workqueues, list)
 			take_over_work(wq, hotcpu);
+		mutex_unlock(&workqueue_mutex);
 		break;
 	}