From 1d615482547584b9a8bb6316a58fed6ce90dd9ff Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 17 Nov 2009 14:54:03 +0100
Subject: sched: Convert pi_lock to raw_spinlock

Convert locks which cannot be sleeping locks in preempt-rt to
raw_spinlocks.

Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra
Acked-by: Ingo Molnar
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 294eb2f80144..41a9ea322dce 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1409,7 +1409,7 @@ struct task_struct {
 #endif
 
 	/* Protection of the PI data structures: */
-	spinlock_t		pi_lock;
+	raw_spinlock_t		pi_lock;
 
 #ifdef CONFIG_RT_MUTEXES
 	/* PI waiters blocked on a rt_mutex held by this task */
--
cgit v1.2.3
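Background for the conversion (not part of the patch): under PREEMPT_RT, spinlock_t becomes a sleeping lock, while raw_spinlock_t keeps spinning with preemption and interrupts disabled, which is what the PI machinery needs. Callers of pi_lock therefore move to the raw_spin_*() API. The function below is only an illustration of that calling convention, with a made-up name; it is not code from this series.

	/* Illustration only: a pi_lock critical section after the type change.
	 * raw_spin_lock_irqsave() never sleeps, even on preempt-rt. */
	static void example_pi_section(struct task_struct *p)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&p->pi_lock, flags);
		/* ... inspect or update p's priority-inheritance state ... */
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
	}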
From e4c570c4cb7a95dbfafa3d016d2739bf3fdfe319 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto
Date: Mon, 14 Dec 2009 18:00:26 -0800
Subject: task_struct: make journal_info conditional

journal_info in task_struct is used only by journaling filesystems, so
introduce CONFIG_FS_JOURNAL_INFO and make the field conditional on it.

Signed-off-by: Hiroshi Shimamoto
Cc: Chris Mason
Cc: "Theodore Ts'o"
Cc: Steven Whitehouse
Cc: KONISHI Ryusuke
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 294eb2f80144..7d388494f45d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1446,8 +1446,10 @@ struct task_struct {
 	gfp_t lockdep_reclaim_gfp;
 #endif
 
+#ifdef CONFIG_FS_JOURNAL_INFO
 /* journalling filesystem info */
 	void *journal_info;
+#endif
 
 /* stacked block device info */
 	struct bio *bio_list, **bio_tail;
--
cgit v1.2.3

From 569b846df54ffb2827b83ce3244c5f032394cba4 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Tue, 15 Dec 2009 16:47:03 -0800
Subject: memcg: coalesce uncharge during unmap/truncate

In a massively parallel environment, res_counter can become a performance
bottleneck.  One strong technique for reducing lock contention is to reduce
the number of calls by coalescing several of them into one.

Considering the charge/uncharge characteristics:
 - charge is done one by one via demand paging.
 - uncharge is done
     - in chunks at munmap, truncate, exit, execve...
     - one by one via vmscan/paging.

So we have a chance to coalesce uncharges to improve scalability at
unmap/truncation.

This patch implements coalesced uncharge.  To avoid scattering memcg's
structures across functions under mm/, it adds batch-uncharge information
to the task.  The reason for per-task batching is to make use of the
caller's context: we do batched (delayed) uncharge when truncation/unmap
occurs, but direct uncharge when the uncharge is called from memory
reclaim (vmscan.c).

The degree of coalescing depends on the caller:
 - at invalidate/truncate ... pagevec size
 - at unmap ................ ZAP_BLOCK_SIZE
(memory itself is freed at this granularity), so we will not coalesce too
much.

On an x86-64 8-CPU server, I measured memcg's page-fault overhead by
running a program which does map/fault/unmap in a loop, one task per CPU
via taskset, and summing the number of page faults over 60 seconds.

 [without memcg config]
   40156968  page-faults       #  0.085 M/sec   ( +- 0.046% )
      27.67  cache-miss/faults

 [root cgroup]
   36659599  page-faults       #  0.077 M/sec   ( +- 0.247% )
      31.58  miss/faults

 [in a child cgroup]
   18444157  page-faults       #  0.039 M/sec   ( +- 0.133% )
      69.96  miss/faults

 [child with this patch]
   27133719  page-faults       #  0.057 M/sec   ( +- 0.155% )
      47.16  miss/faults

This shows a decent improvement (the root cgroup is not affected by this
patch).  A follow-up patch for "charge" will improve these numbers further.

Changelog (since 2009/10/02):
 - renamed fields of memcg_batch (pages to bytes, memsw to memsw_bytes)
 - some cleanup and commentary/description updates
 - added initialization code to copy_process() (possible bug fix)

Changelog (old):
 - fixed the !CONFIG_MEM_CGROUP case
 - rebased onto the latest mmotm + softlimit fix patches
 - unified the patch for callers
 - added comments
 - made ->do_batch a bool
 - removed css_get() et al.; we don't need it

Signed-off-by: KAMEZAWA Hiroyuki
Cc: Balbir Singh
Cc: Daisuke Nishimura
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c858f38e81a..f4c145410a8d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1544,6 +1544,14 @@ struct task_struct {
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
 	unsigned long stack_start;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
+	struct memcg_batch_info {
+		int do_batch;	/* incremented when batch uncharge started */
+		struct mem_cgroup *memcg; /* target memcg of uncharge */
+		unsigned long bytes;		/* uncharged usage */
+		unsigned long memsw_bytes;	/* uncharged mem+swap usage */
+	} memcg_batch;
+#endif
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
--
cgit v1.2.3

From 614c517d7c00af1b26ded20646b329397d6f51a1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 15 Dec 2009 16:47:22 -0800
Subject: signals: SEND_SIG_NOINFO should be considered as SI_FROMUSER()

No changes in compiled code.  The patch adds a new helper, si_fromuser(),
and changes check_kill_permission() to use it.

The real effect of this patch is that from now on we "officially" consider
SEND_SIG_NOINFO signals to be "from user-space" signals.  This is already
true for the code which uses SEND_SIG_NOINFO, except that __send_signal()
has another opinion - see the next patch.

The naming of these special SEND_SIG_XXX siginfos is really bad, imho.
From __send_signal()'s point of view they mean:

	SEND_SIG_NOINFO		from user
	SEND_SIG_PRIV		from kernel
	SEND_SIG_FORCED		no info

Signed-off-by: Oleg Nesterov
Cc: Roland McGrath
Reviewed-by: Sukadev Bhattiprolu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f4c145410a8d..57b3516f055b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2102,11 +2102,6 @@ static inline int kill_cad_pid(int sig, int priv)
 #define SEND_SIG_PRIV	((struct siginfo *) 1)
 #define SEND_SIG_FORCED	((struct siginfo *) 2)
 
-static inline int is_si_special(const struct siginfo *info)
-{
-	return info <= SEND_SIG_FORCED;
-}
-
 /*
  * True if we are on the alternate signal stack.
 */
--
cgit v1.2.3
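The new helper itself is not visible in this header hunk; only is_si_special() is removed here, presumably moving to kernel/signal.c next to the new code. As a rough sketch of the semantics the changelog describes - the exact body is an assumption, not a quote from the patch - it would look roughly like this:

	/* Sketch only: a signal counts as "from user" if it carries the
	 * SEND_SIG_NOINFO marker, or if it has a real siginfo whose
	 * si_code reports a user-space origin (SI_FROMUSER()). */
	static inline int si_fromuser(const struct siginfo *info)
	{
		return info == SEND_SIG_NOINFO ||
			(!is_si_special(info) && SI_FROMUSER(info));
	}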
From ad09750b51150ca87531b8790a379214a974c167 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 15 Dec 2009 16:47:25 -0800
Subject: signals: kill force_sig_specific()

Kill force_sig_specific(); this trivial wrapper has no callers.

Signed-off-by: Oleg Nesterov
Cc: Roland McGrath
Cc: Sukadev Bhattiprolu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 57b3516f055b..244c287a5ac1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2083,7 +2083,6 @@ extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
-extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
 extern void zap_other_threads(struct task_struct *p);
 extern struct sigqueue *sigqueue_alloc(void);
--
cgit v1.2.3

From efbbd05a595343a413964ad85a2ad359b7b7efbd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 16 Dec 2009 18:04:40 +0100
Subject: sched: Add pre and post wakeup hooks

As will be apparent in the next patch, we need a pre-wakeup hook for
sched_fair task migration, hence rename the post-wakeup hook and add a
pre-wakeup one.

Signed-off-by: Peter Zijlstra
Cc: Mike Galbraith
LKML-Reference: <20091216170518.114746117@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c858f38e81a..2c9fa1ccebff 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1091,7 +1091,8 @@ struct sched_class {
 			      enum cpu_idle_type idle);
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
-	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
+	void (*task_waking) (struct rq *this_rq, struct task_struct *task);
+	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
 				 const struct cpumask *newmask);
--
cgit v1.2.3
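The next patch in the series puts the new pre-wakeup hook to use. As a rough sketch of the kind of implementation it enables - the function name and body below are assumptions for illustration, not a quote from that patch; cfs_rq_of() is the fair-class helper mapping a sched_entity to its runqueue - a fair-class task_waking() handler could normalize the vruntime against the old runqueue before the task is migrated:

	/* Sketch only: make vruntime relative to the old cfs_rq while the
	 * old rq->lock is still held, so it can be made absolute again on
	 * the destination runqueue at activation time. */
	static void example_task_waking_fair(struct rq *rq, struct task_struct *p)
	{
		struct sched_entity *se = &p->se;
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		se->vruntime -= cfs_rq->min_vruntime;
	}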
From 88ec22d3edb72b261f8628226cd543589a6d5e1b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 16 Dec 2009 18:04:41 +0100
Subject: sched: Remove the cfs_rq dependency from set_task_cpu()

In order to remove the cfs_rq dependency from set_task_cpu() we need to
ensure the task is cfs_rq invariant for all callsites.

The simple approach is to subtract cfs_rq->min_vruntime from se->vruntime
on dequeue, and to add cfs_rq->min_vruntime back on enqueue.

However, this has the downside of breaking FAIR_SLEEPERS, since we lose
the old vruntime once we only maintain the relative position.

To solve this, we observe that we only migrate runnable tasks -- we do
this using deactivate_task(.sleep=0) and activate_task(.wakeup=0) --
therefore we can restrict the min_vruntime invariance to that state.

The only other case is wakeup balancing: since we want to maintain the
old vruntime, we cannot make it relative on dequeue, but since we don't
migrate inactive tasks, we can do so right before we activate the task
again.

This is where we need the new pre-wakeup hook; we need to call it while
still holding the old rq->lock.  We could fold it into ->select_task_rq(),
but since that has multiple callsites and would obfuscate the locking
requirements, that seems like a fudge.

This leaves the fork() case: simply make sure that ->task_fork() leaves
->vruntime in a relative state.

This covers all cases where set_task_cpu() gets called and ensures it
sees a relative vruntime.

Signed-off-by: Peter Zijlstra
Cc: Mike Galbraith
LKML-Reference: <20091216170518.191697025@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c9fa1ccebff..973b2b89f86d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1116,7 +1116,7 @@ struct sched_class {
 			     struct task_struct *task);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	void (*moved_group) (struct task_struct *p);
+	void (*moved_group) (struct task_struct *p, int on_rq);
 #endif
 };
--
cgit v1.2.3

From a4636818f8e0991f32d9528f39cf4f3d6a7d30a3 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 17 Dec 2009 11:43:29 -0600
Subject: cpumask: rename tsk_cpumask to tsk_cpus_allowed

No one uses this wrapper yet, and Ingo asked that it be kept consistent
with current task_struct usage.

(One user crept in via linux-next: fixed.)

Signed-off-by: Rusty Russell
Cc: Tejun Heo
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 244c287a5ac1..4d7adb282bdd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1555,7 +1555,7 @@ struct task_struct {
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
-#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
+#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
--
cgit v1.2.3

From 733421516b42c44b9e21f1793c430cc801ef8324 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 17 Dec 2009 13:16:27 +0100
Subject: sched: Move TASK_STATE_TO_CHAR_STR near the TASK_state bits

So that we don't keep forgetting about it.

Signed-off-by: Peter Zijlstra
LKML-Reference: <20091217121829.815779372@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 973b2b89f86d..c28ed1b1d7c2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -193,6 +193,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #define TASK_WAKEKILL		128
 #define TASK_WAKING		256
 
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
 /* Convenience macros for the sake of set_task_state */
 #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
 #define TASK_STOPPED		(TASK_WAKEKILL | __TASK_STOPPED)
@@ -2595,8 +2597,6 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 }
 #endif /* CONFIG_MM_OWNER */
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
-
 #endif /* __KERNEL__ */
 
 #endif
--
cgit v1.2.3
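Keeping the string next to the bit definitions matters because the two must stay in sync: bit n of a task's state maps to character n+1 of the string, with state 0 (TASK_RUNNING) mapping to 'R'. A hypothetical decoder, shown only to make that correspondence concrete (it is not part of these patches), could look like this:

	/* Hypothetical decoder: the lowest set state bit selects the
	 * character; a bit the string does not know about yet falls
	 * back to '?'. */
	static char example_task_state_char(unsigned long state)
	{
		static const char str[] = TASK_STATE_TO_CHAR_STR;
		unsigned int bit = state ? __ffs(state) + 1 : 0;

		return bit < sizeof(str) - 1 ? str[bit] : '?';
	}

The next two patches extend the string for the newer states and then let the compiler enforce that its length matches the number of state bits.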
From 44d90df6b757c59651ddd55f1a84f28132b50d29 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 17 Dec 2009 13:16:28 +0100
Subject: sched: Add missing state chars to TASK_STATE_TO_CHAR_STR

We grew three new task states since the last time someone touched it.

Signed-off-by: Peter Zijlstra
LKML-Reference: <20091217121829.892737686@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c28ed1b1d7c2..94858df38a87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -193,7 +193,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #define TASK_WAKEKILL		128
 #define TASK_WAKING		256
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
 
 /* Convenience macros for the sake of set_task_state */
 #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
--
cgit v1.2.3

From e1781538cf5c870ab696e9b8f0a5c498d3900f2f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 17 Dec 2009 13:16:30 +0100
Subject: sched: Assert task state bits at build time

Since everybody is lazy and prone to forgetting things, make the
compiler help us a bit.

Signed-off-by: Peter Zijlstra
LKML-Reference: <20091217121830.060186433@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 94858df38a87..37543876ddf5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -192,9 +192,13 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #define TASK_DEAD		64
 #define TASK_WAKEKILL		128
 #define TASK_WAKING		256
+#define TASK_STATE_MAX		512
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
 
+extern char ___assert_task_state[1 - 2*!!(
+		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
+
 /* Convenience macros for the sake of set_task_state */
 #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
 #define TASK_STOPPED		(TASK_WAKEKILL | __TASK_STOPPED)
--
cgit v1.2.3

From b6e3224fb20954f155e41ec5709b2ab70b50ae2d Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Thu, 17 Dec 2009 13:23:24 -0800
Subject: Revert "task_struct: make journal_info conditional"

This reverts commit e4c570c4cb7a95dbfafa3d016d2739bf3fdfe319, as requested
by Alexey:

 "I think I gave good enough arguments not to merge it.  To iterate:
   * the patch makes it impossible to start using ext3 on EXT3_FS=n
     kernels without a reboot.
   * this is done only for one pointer in task_struct.
  None of the config options which define task_struct are tristate,
  directly or effectively."

Requested-by: Alexey Dobriyan
Acked-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 244c287a5ac1..211ed32befbd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1446,10 +1446,8 @@ struct task_struct {
 	gfp_t lockdep_reclaim_gfp;
 #endif
 
-#ifdef CONFIG_FS_JOURNAL_INFO
 /* journalling filesystem info */
 	void *journal_info;
-#endif
 
 /* stacked block device info */
 	struct bio *bio_list, **bio_tail;
--
cgit v1.2.3
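A closing note on the assertion idiom used in e1781538cf5c above (a generic illustration with made-up names, not code from the series): !!(cond) evaluates to 0 or 1, so the declared array size is either 1 or -1, and a negative array size is a compile-time error. Any future mismatch between TASK_STATE_TO_CHAR_STR and TASK_STATE_MAX therefore breaks the build instead of silently misreporting task states.

	/* Generic form of the trick: size is 1 when cond is false, -1 when
	 * it is true, and a negative array size fails to compile. */
	#define EXAMPLE_BUILD_ASSERT(name, cond) \
		extern char name[1 - 2 * !!(cond)]

	/* Compiles: "RSDTtZXxKW" really is 10 characters long. */
	EXAMPLE_BUILD_ASSERT(__example_state_str_check,
			     sizeof("RSDTtZXxKW") - 1 != 10);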