From cd1bb94b4a0531e8211a3774f17de831f8285f76 Mon Sep 17 00:00:00 2001
From: Arun R Bharadwaj
Date: Thu, 16 Apr 2009 12:15:34 +0530
Subject: timers: /proc/sys sysctl hook to enable timer migration

* Arun R Bharadwaj [2009-04-16 12:11:36]:

This patch creates the /proc/sys sysctl interface at
/proc/sys/kernel/timer_migration

Timer migration is enabled by default.

To disable timer migration, when CONFIG_SCHED_DEBUG = y:

echo 0 > /proc/sys/kernel/timer_migration

Signed-off-by: Arun R Bharadwaj
Signed-off-by: Thomas Gleixner
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049c..618504010400 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1766,6 +1766,7 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_timer_migration;

 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
--
cgit v1.2.3


From eea08f32adb3f97553d49a4f79a119833036000a Mon Sep 17 00:00:00 2001
From: Arun R Bharadwaj
Date: Thu, 16 Apr 2009 12:16:41 +0530
Subject: timers: Logic to move non pinned timers

* Arun R Bharadwaj [2009-04-16 12:11:36]:

This patch migrates all non-pinned timers and hrtimers from idle CPUs to
the current idle load balancer. Timers firing on busy CPUs are not
migrated.

While migrating hrtimers, care must be taken not to introduce extra
latency: the expiry of an hrtimer is compared with the next timer
interrupt on the target cpu, and the hrtimer is migrated only if it
expires *after* the next interrupt on the target cpu. A
clockevents_get_next_event() helper function is added to return the
next_event of the target cpu's clock_event_device.

[ tglx: cleanups and simplifications ]

Signed-off-by: Arun R Bharadwaj
Signed-off-by: Thomas Gleixner
---
 include/linux/sched.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 618504010400..311dec123974 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -257,6 +257,7 @@ extern void task_rq_unlock_wait(struct task_struct *p);
 extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
+extern int get_nohz_load_balancer(void);
 #else
 static inline int select_nohz_load_balancer(int cpu)
 {
@@ -1772,6 +1773,17 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
 		loff_t *ppos);
 #endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+	return sysctl_timer_migration;
+}
+#else
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+	return 1;
+}
+#endif

 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
--
cgit v1.2.3
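As a rough sketch of how the two helpers above are meant to be combined by
the timer core when a timer is armed — example_pick_timer_cpu() is an
invented name for illustration, not code from the patch — the per-timer
placement decision looks roughly like this:

#include <linux/sched.h>

/* Sketch only: pick the CPU a newly armed, non-pinned timer should run on. */
static int example_pick_timer_cpu(int this_cpu, int pinned)
{
	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
		int preferred_cpu = get_nohz_load_balancer();

		/* Queue the timer on the idle load balancer, if there is one. */
		if (preferred_cpu >= 0)
			return preferred_cpu;
	}

	/* Pinned timers, busy CPUs, or no balancer: keep the timer local. */
	return this_cpu;
}

Pinned timers and timers armed on busy CPUs stay where they are; only a
timer armed from an idle CPU is pushed toward the idle load balancer.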
From 690cc3ffe33ac4a2857583c22d4c6244ae11684d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Wed, 13 May 2009 16:55:10 +0000
Subject: syscall: Implement a convenience function restart_syscall

Currently, when we have a signal pending, we have the functionality to
restart the current system call. There are other cases, such as nasty
lock-ordering issues, where it makes sense to have a simple fix that uses
a trylock and restarts the system call, buying time to figure out how to
rework the locking strategy.

Signed-off-by: Eric W. Biederman
Signed-off-by: David S. Miller
---
 include/linux/sched.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049c..d853f6bb0baf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2178,6 +2178,12 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
 	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
 }

+static inline int restart_syscall(void)
+{
+	set_tsk_thread_flag(current, TIF_SIGPENDING);
+	return -ERESTARTNOINTR;
+}
+
 static inline int signal_pending(struct task_struct *p)
 {
 	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
--
cgit v1.2.3
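A hedged sketch of the intended usage pattern — example_ioctl() and
example_lock are invented for illustration and are not part of the patch:
a handler that cannot take a lock in the required order simply backs out
and asks for the whole system call to be replayed.

#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/sched.h>

static DEFINE_MUTEX(example_lock);	/* hypothetical lock with ordering trouble */

static long example_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	long ret;

	/* Avoid blocking in the wrong lock order: back out and retry later. */
	if (!mutex_trylock(&example_lock))
		return restart_syscall();

	ret = 0;	/* ... do the real work here ... */

	mutex_unlock(&example_lock);
	return ret;
}

The restart is transparent to userspace: -ERESTARTNOINTR is handled on the
return-to-userspace path, which re-issues the system call, so the caller
never observes the intermediate failure.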
From 3959214f971417f4162926ac52ad4cd042958caa Mon Sep 17 00:00:00 2001
From: Kay Sievers
Date: Tue, 24 Mar 2009 15:43:30 +0100
Subject: sched: delayed cleanup of user_struct

During bootup performance tracing we see repeated occurrences of
/sys/kernel/uid/* events for the same uid, leading, in this case, to
rather pointless userspace processing of the same uid over and over.

This is usually caused by tools which change their uid to "nobody", to run
without privileges while reading data supplied by untrusted users.

This change delays the execution of the (already existing) scheduled
cleanup work by one second, so the allocated and announced uid can
possibly be re-used by another process.

This is the current behavior, where almost every invocation of a binary
that changes the uid creates two events:

  $ read START < /sys/kernel/uevent_seqnum; \
    for i in `seq 100`; do su --shell=/bin/true bin; done; \
    read END < /sys/kernel/uevent_seqnum; \
    echo $(($END - $START))
  178

With the delayed cleanup, we get only two events, and userspace finishes
a bit faster too:

  $ read START < /sys/kernel/uevent_seqnum; \
    for i in `seq 100`; do su --shell=/bin/true bin; done; \
    read END < /sys/kernel/uevent_seqnum; \
    echo $(($END - $START))
  1

Acked-by: Dhaval Giani
Signed-off-by: Kay Sievers
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c900aa530070..7531b1c28201 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -674,7 +674,7 @@ struct user_struct {
 	struct task_group *tg;
 #ifdef CONFIG_SYSFS
 	struct kobject kobj;
-	struct work_struct work;
+	struct delayed_work work;
 #endif
 #endif

--
cgit v1.2.3


From 58568d2a8215cb6f55caf2332017d7bdff954e1c Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Tue, 16 Jun 2009 15:31:49 -0700
Subject: cpuset,mm: update tasks' mems_allowed in time

Fix allocation of page cache/slab objects on a disallowed node when memory
spread is set, by updating a task's mems_allowed right after its cpuset's
mems is changed.

In order to update tasks' mems_allowed in time, we must modify the memory
policy code, because the memory policy was originally applied in the
process's own context. After applying this patch, one task directly
manipulates another's mems_allowed, and we use alloc_lock in the
task_struct to protect the task's mems_allowed and memory policy. In the
fast path, however, we do not take the lock to protect them, because
adding a lock may lead to a performance regression.

Without the lock, the task might see an empty nodemask while its cpuset's
mems_allowed is being changed to some non-overlapping set. To avoid this,
we first set all newly allowed nodes, then clear the newly disallowed
ones.

[lee.schermerhorn@hp.com:

  The rework of mpol_new() to extract the adjusting of the node mask to
  apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
  with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
  allocation. Fix this by adding the check for MPOL_PREFERRED and empty
  node mask to mpol_new_mempolicy().

  Remove the now unneeded 'nodes = NULL' from mpol_new().

  Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes'
  parameter now that it has been removed from mpol_new(). Therefore, we
  don't need to test nodes for NULL before testing it for 'empty'.
  However, just to be extra paranoid, add a VM_BUG_ON() to verify this
  assumption.]

[lee.schermerhorn@hp.com:

  I don't think the function name 'mpol_new_mempolicy' is descriptive
  enough to differentiate it from mpol_new(). This function applies
  cpuset set context, usually constraining nodes to those allowed by the
  cpuset. However, when the 'RELATIVE_NODES' flag is set, it also
  translates the nodes. So I settled on 'mpol_set_nodemask()', because
  the comment block for mpol_new() mentions that we need to call this
  function to "set nodes". Some additional minor line length, whitespace
  and typo cleanup.]

Signed-off-by: Miao Xie
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Christoph Lameter
Cc: Paul Menage
Cc: Nick Piggin
Cc: Yasunori Goto
Cc: Pekka Enberg
Cc: David Rientjes
Signed-off-by: Lee Schermerhorn
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c900aa530070..1048bf50540a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,7 +1318,8 @@ struct task_struct {
 /* Thread group tracking */
 	u32 parent_exec_id;
 	u32 self_exec_id;
-/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
+/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
+ * mempolicy */
 	spinlock_t alloc_lock;

 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -1386,8 +1387,7 @@ struct task_struct {
 	cputime_t acct_timexpd;	/* stime + utime since last update */
 #endif
 #ifdef CONFIG_CPUSETS
-	nodemask_t mems_allowed;
-	int cpuset_mems_generation;
+	nodemask_t mems_allowed;	/* Protected by alloc_lock */
 	int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
@@ -1410,7 +1410,7 @@ struct task_struct {
 	struct list_head perf_counter_list;
 #endif
 #ifdef CONFIG_NUMA
-	struct mempolicy *mempolicy;
+	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
 	short il_next;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
--
cgit v1.2.3
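To make the grow-then-shrink ordering concrete, here is a minimal sketch of
the cross-task update described above. example_update_task_nodemask() is an
invented name; the real cpuset code also rebinds the task's memory policy
under the same lock.

#include <linux/nodemask.h>
#include <linux/sched.h>

/* Sketch only: update another task's mems_allowed without it ever
 * observing an empty nodemask. */
static void example_update_task_nodemask(struct task_struct *tsk,
					 const nodemask_t *newmems)
{
	task_lock(tsk);		/* alloc_lock protects mems_allowed/mempolicy */

	/* First allow the new nodes, so lockless readers never see no nodes... */
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);

	/* ...then drop the nodes that are no longer allowed. */
	tsk->mems_allowed = *newmems;

	task_unlock(tsk);
}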
From 2ff05b2b4eac2e63d345fc731ea151a060247f53 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 16 Jun 2009 15:32:56 -0700
Subject: oom: move oom_adj value from task_struct to mm_struct

The per-task oom_adj value is a characteristic of its mm more than the
task itself, since it is not possible to oom kill any thread that shares
the mm. If a task were killed while attached to an mm that could not be
freed because another thread's oom_adj was set to OOM_DISABLE, it would
have been needlessly terminated, since there is no potential for future
memory freeing.

This patch moves oomkilladj (now more appropriately named oom_adj) from
struct task_struct to struct mm_struct.

This requires task_lock() on a task when checking its oom_adj value, to
protect against exec, but it is already necessary to take the lock when
dereferencing the mm to find the total VM size for the badness heuristic.

This fixes a livelock if the oom killer chooses a task and another thread
sharing the same memory has an oom_adj value of OOM_DISABLE. The livelock
occurs because oom_kill_task() repeatedly returns 1 and refuses to kill
the chosen task, while select_bad_process() repeatedly chooses the same
task on the next retry.

Taking task_lock() in select_bad_process() to check for OOM_DISABLE, and
in oom_kill_task() to check for threads sharing the same memory, will be
removed in the next patch in this series, where it will no longer be
necessary.

Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL, since
these threads are already immune from oom killing: they simply report an
oom_adj value of OOM_DISABLE.

Cc: Nick Piggin
Cc: Rik van Riel
Cc: Mel Gorman
Signed-off-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1048bf50540a..1bc6fae0c135 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1178,7 +1178,6 @@ struct task_struct {
 	 * a short time
 	 */
 	unsigned char fpu_counter;
-	s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	unsigned int btrace_seq;
 #endif
--
cgit v1.2.3
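For illustration only — example_read_oom_adj() is an invented helper, not
part of the patch — reading the now mm-based value under task_lock(), as
the /proc/pid/oom_adj path must do after this change, looks roughly like
this:

#include <linux/oom.h>
#include <linux/sched.h>

/* Sketch: fetch a task's oom_adj now that it lives in the mm. */
static int example_read_oom_adj(struct task_struct *task)
{
	int oom_adj;

	task_lock(task);	/* pin task->mm against exec/exit */
	if (!task->mm) {
		task_unlock(task);
		return OOM_DISABLE;	/* kthreads report OOM_DISABLE */
	}
	oom_adj = task->mm->oom_adj;
	task_unlock(task);

	return oom_adj;
}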