From 14a40ffccd6163bbcd1d6f32b28a88ffe6149fc6 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 19 Mar 2013 13:45:20 -0700
Subject: sched: replace PF_THREAD_BOUND with PF_NO_SETAFFINITY

PF_THREAD_BOUND was originally used to mark kernel threads which were
bound to a specific CPU using kthread_bind() and a task with the flag
set allows cpus_allowed modifications only to itself.  Workqueue is
currently abusing it to prevent userland from meddling with
cpus_allowed of workqueue workers.

What we need is a flag to prevent userland from messing with
cpus_allowed of certain kernel tasks.  In kernel, anyone can
(incorrectly) squash the flag, and, for worker-type usages,
restricting cpus_allowed modification to the task itself doesn't
provide meaningful extra protection as other tasks can inject work
items to the task anyway.

This patch replaces PF_THREAD_BOUND with PF_NO_SETAFFINITY.
sched_setaffinity() checks the flag and returns -EINVAL if set.
set_cpus_allowed_ptr() is no longer affected by the flag.  This will
allow simplifying workqueue worker CPU affinity management.

Signed-off-by: Tejun Heo
Acked-by: Ingo Molnar
Reviewed-by: Lai Jiangshan
Cc: Peter Zijlstra
Cc: Thomas Gleixner
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6ddbfb..e5c64f7b8c1d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1793,7 +1793,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
-#define PF_THREAD_BOUND	0x04000000	/* Thread bound to specific cpu */
+#define PF_NO_SETAFFINITY 0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY	0x08000000	/* Early kill for mce process policy */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
--
cgit v1.2.3
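[ The userland-visible effect described above can be exercised with a
  small test program.  This is an illustrative sketch only: PID 42 is
  a made-up placeholder for a kernel thread that has
  PF_NO_SETAFFINITY set (e.g. a kworker); after this patch the
  syscall is expected to fail with EINVAL. ]

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

int main(void)
{
	cpu_set_t set;
	pid_t kworker_pid = 42;	/* placeholder: substitute a real kworker PID */

	CPU_ZERO(&set);
	CPU_SET(0, &set);

	/* Userland may no longer meddle with cpus_allowed of such tasks. */
	if (sched_setaffinity(kworker_pid, sizeof(set), &set) < 0)
		printf("sched_setaffinity: %s\n", strerror(errno));

	return 0;
}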
From ee761f629d598579594d7e1eb8c552f3c5f71e4d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 21 Mar 2013 22:49:32 +0100
Subject: arch: Consolidate tsk_is_polling()

Move it to a common place.  Preparatory patch for implementing
set/clear for the idle need_resched poll implementation.

Signed-off-by: Thomas Gleixner
Cc: Linus Torvalds
Cc: Rusty Russell
Cc: Paul McKenney
Cc: Peter Zijlstra
Reviewed-by: Cc: Srivatsa S. Bhat
Cc: Magnus Damm
Link: http://lkml.kernel.org/r/20130321215233.446034505@linutronix.de
Signed-off-by: Thomas Gleixner
---
 include/linux/sched.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6ddbfb..6709a5813f27 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2621,6 +2621,26 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
+/*
+ * Idle thread specific functions to determine the need_resched
+ * polling state.  We have two versions, one based on TS_POLLING in
+ * thread_info.status and one based on TIF_POLLING_NRFLAG in
+ * thread_info.flags
+ */
+#ifdef TS_POLLING
+static inline int tsk_is_polling(struct task_struct *p)
+{
+	return task_thread_info(p)->status & TS_POLLING;
+}
+#elif defined(TIF_POLLING_NRFLAG)
+static inline int tsk_is_polling(struct task_struct *p)
+{
+	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+}
+#else
+static inline int tsk_is_polling(struct task_struct *p) { return 0; }
+#endif
+
 /*
  * Thread group CPU time accounting.
  */
--
cgit v1.2.3

From 3a98f871ecaf44806e188184332c3fec27c8f08c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 21 Mar 2013 22:49:33 +0100
Subject: idle: Implement set/clr functions for need_resched poll

Implement set/clear functions for the idle need_resched poll
implementation.

Signed-off-by: Thomas Gleixner
Cc: Linus Torvalds
Cc: Rusty Russell
Cc: Paul McKenney
Cc: Peter Zijlstra
Reviewed-by: Cc: Srivatsa S. Bhat
Cc: Magnus Damm
Link: http://lkml.kernel.org/r/20130321215233.518839807@linutronix.de
Signed-off-by: Thomas Gleixner
---
 include/linux/sched.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6709a5813f27..21fe9a142e51 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2632,13 +2632,34 @@ static inline int tsk_is_polling(struct task_struct *p)
 {
 	return task_thread_info(p)->status & TS_POLLING;
 }
+static inline void current_set_polling(void)
+{
+	current_thread_info()->status |= TS_POLLING;
+}
+
+static inline void current_clr_polling(void)
+{
+	current_thread_info()->status &= ~TS_POLLING;
+	smp_mb__after_clear_bit();
+}
 #elif defined(TIF_POLLING_NRFLAG)
 static inline int tsk_is_polling(struct task_struct *p)
 {
 	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
 }
+static inline void current_set_polling(void)
+{
+	set_thread_flag(TIF_POLLING_NRFLAG);
+}
+
+static inline void current_clr_polling(void)
+{
+	clear_thread_flag(TIF_POLLING_NRFLAG);
+}
 #else
 static inline int tsk_is_polling(struct task_struct *p) { return 0; }
+static inline void current_set_polling(void) { }
+static inline void current_clr_polling(void) { }
 #endif
 
 /*
--
cgit v1.2.3
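[ To show how these helpers fit together, here is a simplified,
  illustrative idle loop.  It is not the cpu_idle() of any particular
  architecture; example_cpu_idle() is a hypothetical name and the
  low-power entry is reduced to cpu_relax(). ]

static void example_cpu_idle(void)
{
	while (1) {
		current_set_polling();	/* wakers can skip the resched IPI */
		while (!need_resched())
			cpu_relax();	/* a real idle loop would enter a C-state */
		current_clr_polling();	/* from here on an IPI is needed again */
		schedule_preempt_disabled();
	}
}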
From 9b89f6ba2ab56e4d9c00e7e591d6bc333137895e Mon Sep 17 00:00:00 2001
From: Andrei Epure
Date: Thu, 11 Apr 2013 20:30:29 +0300
Subject: sched: Document task_struct::personality field

Signed-off-by: Andrei Epure
Cc: Linus Torvalds
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1365701429-4721-1-git-send-email-epure.andrei@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9004f6e19eac..6bdaa73ede13 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1105,8 +1105,10 @@ struct task_struct {
 	int exit_code, exit_signal;
 	int pdeath_signal;	/* The signal sent when the parent dies */
 	unsigned int jobctl;	/* JOBCTL_*, siglock protected */
-	/* ??? */
+
+	/* Used for emulating ABI behavior of previous Linux versions */
 	unsigned int personality;
+
 	unsigned did_exec:1;
 	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
 				 * execve */
--
cgit v1.2.3

From f2530dc71cf0822f90bb63ea4600caaef33a66bb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 9 Apr 2013 09:33:34 +0200
Subject: kthread: Prevent unpark race which puts threads on the wrong cpu

The smpboot threads rely on the park/unpark mechanism which binds per
cpu threads on a particular core.  However, the functionality is racy:

CPU0                    CPU1                    CPU2
unpark(T)                                       wake_up_process(T)
  clear(SHOULD_PARK)    T runs
                        leave parkme() due to !SHOULD_PARK
  bind_to(CPU2)         BUG_ON(wrong CPU)

We cannot let the tasks move themselves to the target CPU as one of
those tasks is actually the migration thread itself, which requires
that it starts running on the target cpu right away.

The solution to this problem is to prevent wakeups in park mode which
are not from unpark().  That way we can guarantee that the association
of the task to the target cpu is working correctly.

Add a new task state (TASK_PARKED) which prevents other wakeups and
use this state explicitly for the unpark wakeup.

Peter noticed: Also, since the task state is visible to userspace and
all the parked tasks are still in the PID space, it's a good hint in
ps and friends that these tasks aren't really there for the moment.

The migration thread has another related issue.

CPU0                    CPU1
Bring up CPU2
create_thread(T)
park(T)
  wait_for_completion()
                        parkme()
                        complete()
sched_set_stop_task()
                        schedule(TASK_PARKED)

The sched_set_stop_task() call is issued while the task is on the
runqueue of CPU1 and that confuses the hell out of the stop_task class
on that cpu.  So we need the same synchronization before
sched_set_stop_task().

Reported-by: Dave Jones
Reported-and-tested-by: Dave Hansen
Reported-and-tested-by: Borislav Petkov
Acked-by: Peter Zijlstra
Cc: Srivatsa S. Bhat
Cc: dhillf@gmail.com
Cc: Ingo Molnar
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304091635430.21884@ionos
Signed-off-by: Thomas Gleixner
---
 include/linux/sched.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6ddbfb..e692a022527b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -163,9 +163,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #define TASK_DEAD		64
 #define TASK_WAKEKILL		128
 #define TASK_WAKING		256
-#define TASK_STATE_MAX		512
+#define TASK_PARKED		512
+#define TASK_STATE_MAX		1024
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
+#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
--
cgit v1.2.3
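[ For illustration, the fixed park/unpark handshake looks roughly
  like the sketch below.  It is loosely modeled on kernel/kthread.c
  after this patch; the example_* names are placeholders and the
  per-cpu binding details are elided. ]

static void example_kthread_parkme(struct kthread *self)
{
	__set_current_state(TASK_PARKED);
	while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
		if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
			complete(&self->parked);
		schedule();
		/* Re-arm: only a TASK_PARKED wakeup gets us past here. */
		__set_current_state(TASK_PARKED);
	}
	clear_bit(KTHREAD_IS_PARKED, &self->flags);
	__set_current_state(TASK_RUNNING);
}

static void example_kthread_unpark(struct task_struct *k, struct kthread *kthread)
{
	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
	/*
	 * Waking with TASK_PARKED ensures that only a task sleeping in
	 * the parked state is woken, so a stray wake_up_process() from
	 * another CPU can no longer kick the thread out of parkme()
	 * before it is bound to the target cpu.
	 */
	wake_up_state(k, TASK_PARKED);
}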
From 41fcb9f230bf773656d1768b73000ef720bf00c3 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Wed, 17 Apr 2013 15:23:11 -0400
Subject: mutex: Move mutex spinning code from sched/core.c back to mutex.c

As mentioned by Ingo, the SCHED_FEAT_OWNER_SPIN scheduler feature bit
was really just an early hack to make with/without mutex-spinning
testable.  So it is no longer necessary.

This patch removes the SCHED_FEAT_OWNER_SPIN feature bit and moves the
mutex spinning code from kernel/sched/core.c back to kernel/mutex.c,
which is where it belongs.

Signed-off-by: Waiman Long
Cc: Linus Torvalds
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Chandramouleeswaran Aswin
Cc: Davidlohr Bueso
Cc: Norton Scott J
Cc: Rik van Riel
Cc: Paul E. McKenney
Cc: David Howells
Cc: Dave Jones
Cc: Clark Williams
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1366226594-5506-2-git-send-email-Waiman.Long@hp.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6ddbfb..aefe45d79f53 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -320,7 +320,6 @@ extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
-extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
 
 struct nsproxy;
 struct user_namespace;
--
cgit v1.2.3
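[ The helper whose extern declaration is dropped above becomes local
  to kernel/mutex.c.  The sketch below illustrates the idea of
  optimistic spinning on the lock owner; it is a simplification, not
  the verbatim kernel code (example_spin_on_owner is a placeholder
  name, and the real helper also checks owner->on_cpu under RCU). ]

static int example_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
	rcu_read_lock();
	while (lock->owner == owner) {
		if (need_resched())	/* stop spinning when preemption is due */
			break;
		arch_mutex_cpu_relax();	/* polite busy-wait */
	}
	rcu_read_unlock();

	/* Spinning further is worthwhile only if the lock is now unowned. */
	return lock->owner == NULL;
}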
From 25f55d9d01ad7a7ad248fd5af1d22675ffd202c5 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Tue, 23 Apr 2013 16:59:02 +0200
Subject: sched: Fix init NOHZ_IDLE flag

On my SMP platform, which is made of 5 cores in 2 clusters, the
nr_busy_cpus field of the sched_group_power struct is not null when
the platform is fully idle - which makes the scheduler unhappy.

The root cause is:

During the boot sequence, some CPUs reach the idle loop and set their
NOHZ_IDLE flag while waiting for other CPUs to boot.  But the
nr_busy_cpus field is initialized later with the assumption that all
CPUs are in the busy state whereas some CPUs have already set their
NOHZ_IDLE flag.

More generally, the NOHZ_IDLE flag must be initialized when new
sched_domains are created in order to ensure that NOHZ_IDLE and
nr_busy_cpus are aligned.

This condition can be ensured by adding a synchronize_rcu() between
the destruction of old sched_domains and the creation of new ones so
the NOHZ_IDLE flag will not be updated with an old sched_domain once
it has been initialized.  But this solution introduces additional
latency in the rebuild sequence that is called during cpu hotplug.

As suggested by Frederic Weisbecker, another solution is to have the
same rcu lifecycle for both NOHZ_IDLE and the sched_domain struct.  A
new nohz_idle field is added to sched_domain so that both the status
and the sched_domain will share the same RCU lifecycle and will always
be synchronized.  In addition, there is no longer any need to protect
nohz_idle against concurrent access as it is only modified by two
mutually exclusive functions called by the local cpu.

This solution has been preferred to the creation of a new struct with
an extra pointer indirection for sched_domain.

The synchronization is done at the cost of:

 - an additional indirection and an rcu_dereference for accessing
   nohz_idle;
 - using only the nohz_idle field of the top sched_domain.

Signed-off-by: Vincent Guittot
Acked-by: Peter Zijlstra
Cc: linaro-kernel@lists.linaro.org
Cc: peterz@infradead.org
Cc: fweisbec@gmail.com
Cc: pjt@google.com
Cc: rostedt@goodmis.org
Cc: efault@gmx.de
Link: http://lkml.kernel.org/r/1366729142-14662-1-git-send-email-vincent.guittot@linaro.org
[ Fixed !NO_HZ build bug. ]
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6bdaa73ede13..a25168f4ab86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -808,6 +808,8 @@ struct sched_domain {
 	unsigned int wake_idx;
 	unsigned int forkexec_idx;
 	unsigned int smt_gain;
+
+	int nohz_idle;			/* NOHZ IDLE status */
 	int flags;			/* See SD_* */
 	int level;
--
cgit v1.2.3
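[ A rough sketch of how the new field is consumed, loosely modeled on
  the NOHZ busy-accounting helpers in kernel/sched/fair.c after this
  patch.  example_set_cpu_sd_state_idle() is a placeholder name and
  the exact field paths in the real code may differ. ]

static void example_set_cpu_sd_state_idle(int cpu)
{
	struct sched_domain *sd;

	rcu_read_lock();
	sd = rcu_dereference(cpu_rq(cpu)->sd);	/* top sched_domain only */
	if (sd && !sd->nohz_idle) {
		/* The flag shares the sched_domain's RCU lifecycle, so it
		 * can never be observed against a stale domain. */
		sd->nohz_idle = 1;
		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
	}
	rcu_read_unlock();
}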