From cf475ad28ac35cc9ba612d67158f29b73b38b05d Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 29 Apr 2008 01:00:16 -0700 Subject: cgroups: add an owner to the mm_struct Remove the mem_cgroup member from mm_struct and instead adds an owner. This approach was suggested by Paul Menage. The advantage of this approach is that, once the mm->owner is known, using the subsystem id, the cgroup can be determined. It also allows several control groups that are virtually grouped by mm_struct, to exist independent of the memory controller i.e., without adding mem_cgroup's for each controller, to mm_struct. A new config option CONFIG_MM_OWNER is added and the memory resource controller selects this config option. This patch also adds cgroup callbacks to notify subsystems when mm->owner changes. The mm_cgroup_changed callback is called with the task_lock() of the new task held and is called just prior to changing the mm->owner. I am indebted to Paul Menage for the several reviews of this patchset and helping me make it lighter and simpler. This patch was tested on a powerpc box, it was compiled with both the MM_OWNER config turned on and off. After the thread group leader exits, it's moved to init_css_state by cgroup_exit(), thus all future charges from runnings threads would be redirected to the init_css_set's subsystem. Signed-off-by: Balbir Singh Cc: Pavel Emelianov Cc: Hugh Dickins Cc: Sudhir Kumar Cc: YAMAMOTO Takashi Cc: Hirokazu Takahashi Cc: David Rientjes , Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Acked-by: Pekka Enberg Reviewed-by: Paul Menage Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 024d72b47a0c..1d02babdb2c7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2148,6 +2148,19 @@ static inline void migration_init(void) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +#ifdef CONFIG_MM_OWNER +extern void mm_update_next_owner(struct mm_struct *mm); +extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); +#else +static inline void mm_update_next_owner(struct mm_struct *mm) +{ +} + +static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +} +#endif /* CONFIG_MM_OWNER */ + #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From e442055193e4584218006e616c9bdce0c5e9ae5c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2008 00:52:44 -0700 Subject: signals: re-assign CLD_CONTINUED notification from the sender to reciever Based on discussion with Jiri and Roland. In short: currently handle_stop_signal(SIGCONT, p) sends the notification to p->parent, with this patch p itself notifies its parent when it becomes running. handle_stop_signal(SIGCONT) has to drop ->siglock temporary in order to notify the parent with do_notify_parent_cldstop(). This leads to multiple problems: - as Jiri Kosina pointed out, the stopped task can resume without actually seeing SIGCONT which may have a handler. - we race with another sig_kernel_stop() signal which may come in that window. - we race with sig_fatal() signals which may set SIGNAL_GROUP_EXIT in that window. - we can't avoid taking tasklist_lock() while sending SIGCONT. With this patch handle_stop_signal() just sets the new SIGNAL_CLD_CONTINUED flag in p->signal->flags and returns. The notification is sent by the first task which returns from finish_stop() (there should be at least one) or any other signalled thread from get_signal_to_deliver(). This is a user-visible change. Say, currently kill(SIGCONT, stopped_child) can't return without seeing SIGCHLD, with this patch SIGCHLD can be delayed unpredictably. Another difference is that if the child is ptraced by another process, CLD_CONTINUED may be delivered to ->real_parent after ptrace_detach() while currently it always goes to the tracer which doesn't actually need this notification. Hopefully not a problem. The patch asks for the futher obvious cleanups, I'll send them separately. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1d02babdb2c7..ef5615270342 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -554,6 +554,12 @@ struct signal_struct { #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ +/* + * Pending notifications to parent. + */ +#define SIGNAL_CLD_STOPPED 0x00000010 +#define SIGNAL_CLD_CONTINUED 0x00000020 +#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) -- cgit v1.2.3 From ac5c215383f43a106ba4ef298126bf78c126f5e9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2008 00:52:57 -0700 Subject: signals: join send_sigqueue() with send_group_sigqueue() We export send_sigqueue() and send_group_sigqueue() for the only user, posix_timer_event(). This is a bit silly, because both are just trivial helpers on top of do_send_sigqueue() and because the we pass the unused .si_signo parameter. Kill them both, rename do_send_sigqueue() to send_sigqueue(), and export it. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index ef5615270342..0917b3df12d5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1751,8 +1751,7 @@ extern void zap_other_threads(struct task_struct *p); extern int kill_proc(pid_t, int, int); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); -extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); +extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); -- cgit v1.2.3 From fae5fa44f1fd079ffbed8e0add929dd7bbd1347f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2008 00:53:03 -0700 Subject: signals: fix /sbin/init protection from unwanted signals The global init has a lot of long standing problems with the unhandled fatal signals. - The "is_global_init(current)" check in get_signal_to_deliver() protects only the main thread. Sub-thread can dequee the fatal signal and shutdown the whole thread group except the main thread. If it dequeues SIGSTOP /sbin/init will be stopped, this is not right too. Note that we can't use is_global_init(->group_leader), this breaks exec and this can't solve other problems we have. - Even if afterwards ignored, the fatal signals sets SIGNAL_GROUP_EXIT on delivery. This breaks exec, has other bad implications, and this is just wrong. Introduce the new SIGNAL_UNKILLABLE flag to fix these problems. It also helps to solve some other problems addressed by the subsequent patches. Currently we use this flag for the global init only, but it could also be used by kthreads and (perhaps) by the sub-namespace inits. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0917b3df12d5..fe970cdca83c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -561,6 +561,8 @@ struct signal_struct { #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) +#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ + /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { -- cgit v1.2.3 From f3de272b821accbc8387211977c2de4f38468d05 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 30 Apr 2008 00:53:09 -0700 Subject: signals: use HAVE_SET_RESTORE_SIGMASK Change all the #ifdef TIF_RESTORE_SIGMASK conditionals in non-arch code to #ifdef HAVE_SET_RESTORE_SIGMASK. If arch code defines it first, the generic set_restore_sigmask() using TIF_RESTORE_SIGMASK is not defined. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index fe970cdca83c..86e60796db62 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1175,7 +1175,7 @@ struct task_struct { struct sighand_struct *sighand; sigset_t blocked, real_blocked; - sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ + sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ struct sigpending pending; unsigned long sas_ss_sp; -- cgit v1.2.3 From 5cd204550b1a006f2b0c986b0e0f53220ebfd391 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 30 Apr 2008 00:54:24 -0700 Subject: Deprecate find_task_by_pid() There are some places that are known to operate on tasks' global pids only: * the rest_init() call (called on boot) * the kgdb's getthread * the create_kthread() (since the kthread is run in init ns) So use the find_task_by_pid_ns(..., &init_pid_ns) there and schedule the find_task_by_pid for removal. [sukadev@us.ibm.com: Fix warning in kernel/pid.c] Signed-off-by: Pavel Emelyanov Cc: "Eric W. Biederman" Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 86e60796db62..03c238088aee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1677,7 +1677,10 @@ extern struct pid_namespace init_pid_ns; extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, struct pid_namespace *ns); -extern struct task_struct *find_task_by_pid(pid_t nr); +static inline struct task_struct *__deprecated find_task_by_pid(pid_t nr) +{ + return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns); +} extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); -- cgit v1.2.3 From 8ae121ac8666b0421aa20fd80d4597ec66fa54bc Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 23 Apr 2008 07:13:29 -0400 Subject: sched: fix RT task-wakeup logic Dmitry Adamushko pointed out a logic error in task_wake_up_rt() where we will always evaluate to "true". You can find the thread here: http://lkml.org/lkml/2008/4/22/296 In reality, we only want to try to push tasks away when a wake up request is not going to preempt the current task. So lets fix it. Note: We introduce test_tsk_need_resched() instead of open-coding the flag check so that the merge-conflict with -rt should help remind us that we may need to support NEEDS_RESCHED_DELAYED in the future, too. Signed-off-by: Gregory Haskins CC: Dmitry Adamushko CC: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 03c238088aee..698b5a4d25a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1977,6 +1977,11 @@ static inline void clear_tsk_need_resched(struct task_struct *tsk) clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } +static inline int test_tsk_need_resched(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); +} + static inline int signal_pending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); @@ -1991,7 +1996,7 @@ static inline int fatal_signal_pending(struct task_struct *p) static inline int need_resched(void) { - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); + return unlikely(test_tsk_need_resched(current)); } /* -- cgit v1.2.3 From 690229a0912ca2fef8b542fe4d8b73acfcdc6e24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Apr 2008 09:31:35 +0200 Subject: sched: make clock sync tunable by architecture code make time_sync_thresh tunable to architecture code. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 698b5a4d25a7..54c9ca26b7d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) } #endif +extern unsigned long long time_sync_thresh; + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). -- cgit v1.2.3 From 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 3 May 2008 18:29:28 +0200 Subject: sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK this replaces the rq->clock stuff (and possibly cpu_clock()). - architectures that have an 'imperfect' hardware clock can set CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - the 'jiffie' window might be superfulous when we update tick_gtod before the __update_sched_clock() call in sched_clock_tick() - cpu_clock() might be implemented as: sched_clock_cpu(smp_processor_id()) if the accuracy proves good enough - how far can TSC drift in a single jiffie when considering the filtering and idle hooks? [ mingo@elte.hu: various fixes and cleanups ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 54c9ca26b7d8..0c35b0343a76 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1553,6 +1553,35 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern unsigned long long sched_clock(void); +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_init(void) +{ +} + +static inline u64 sched_clock_cpu(int cpu) +{ + return sched_clock(); +} + +static inline void sched_clock_tick(void) +{ +} + +static inline void sched_clock_idle_sleep_event(void) +{ +} + +static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +{ +} +#else +extern void sched_clock_init(void); +extern u64 sched_clock_cpu(int cpu); +extern void sched_clock_tick(void); +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#endif + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): -- cgit v1.2.3