From 69e51e92a394088fc3266ed5136903074b44f3c4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 23 Oct 2015 14:32:34 +0200
Subject: sched/wait: Document waitqueue_active()

Kosuke reports that there were a fair number of buggy waitqueue_active()
users and this function deserves a big comment in order to avoid growing
more.

Reported-by: Kosuke Tatsukawa
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 include/linux/wait.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a9..f3bac30587f7 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -102,6 +102,36 @@ init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
 	q->func = func;
 }
 
+/**
+ * waitqueue_active -- locklessly test for waiters on the queue
+ * @q: the waitqueue to test for waiters
+ *
+ * returns true if the wait list is not empty
+ *
+ * NOTE: this function is lockless and requires care, incorrect usage _will_
+ * lead to sporadic and non-obvious failure.
+ *
+ * Use either while holding wait_queue_head_t::lock or when used for wakeups
+ * with an extra smp_mb() like:
+ *
+ *      CPU0 - waker                    CPU1 - waiter
+ *
+ *                                      for (;;) {
+ *      @cond = true;                     prepare_to_wait(&wq, &wait, state);
+ *      smp_mb();                         // smp_mb() from set_current_state()
+ *      if (waitqueue_active(wq))         if (@cond)
+ *        wake_up(wq);                      break;
+ *                                        schedule();
+ *                                      }
+ *                                      finish_wait(&wq, &wait);
+ *
+ * Because without the explicit smp_mb() it's possible for the
+ * waitqueue_active() load to get hoisted over the @cond store such that we'll
+ * observe an empty wait list while the waiter might not observe @cond.
+ *
+ * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
+ * which (when the lock is uncontended) are of roughly equal cost.
+ */
 static inline int waitqueue_active(wait_queue_head_t *q)
 {
 	return !list_empty(&q->task_list);
--
cgit v1.2.3
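A minimal usage sketch of the pattern the new comment prescribes. The wait
API calls are real; the my_dev structure and function names are hypothetical:

	struct my_dev {
		wait_queue_head_t	wq;
		bool			data_ready;
	};

	/* Waker: publish the condition before the lockless wait-list check. */
	static void my_dev_wake(struct my_dev *dev)
	{
		dev->data_ready = true;
		smp_mb();	/* pairs with set_current_state() in the waiter */
		if (waitqueue_active(&dev->wq))
			wake_up(&dev->wq);
	}

	/* Waiter: prepare_to_wait() supplies the pairing barrier. */
	static void my_dev_wait(struct my_dev *dev)
	{
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait(&dev->wq, &wait, TASK_UNINTERRUPTIBLE);
			if (dev->data_ready)
				break;
			schedule();
		}
		finish_wait(&dev->wq, &wait);
	}

Without the smp_mb() in the waker, the waitqueue_active() load could be
reordered before the data_ready store and the wakeup could be lost.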
From 525705d15e63b7455977408e4601e76e6bc41524 Mon Sep 17 00:00:00 2001
From: Byungchul Park
Date: Tue, 10 Nov 2015 09:36:02 +0900
Subject: sched/fair: Consider missed ticks in NOHZ_FULL in update_cpu_load_nohz()

Usually the tick can be stopped for an idle CPU in NOHZ. However, in
NOHZ_FULL mode the tick can also be stopped for a non-idle CPU, a case
that update_cpu_load_nohz() does not consider at all. This patch lets
update_cpu_load_nohz() know whether the calling path comes from
NOHZ_FULL or idle NOHZ.

Signed-off-by: Byungchul Park
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Frederic Weisbecker
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1447115762-19734-3-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index edad7a43edea..f425aac63317 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -177,9 +177,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void update_cpu_load_nohz(void);
+extern void update_cpu_load_nohz(int active);
 #else
-static inline void update_cpu_load_nohz(void) { }
+static inline void update_cpu_load_nohz(int active) { }
 #endif
 
 extern unsigned long get_parent_ip(unsigned long addr);
--
cgit v1.2.3
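A sketch of what the new parameter is for, following the changelog's intent
that a busy CPU's missed ticks should account its current load rather than
zero. weighted_cpuload()/cpu_of() are scheduler-internal helpers of that era;
fold_missed_ticks() is hypothetical, and the real body lives in
kernel/sched/fair.c:

	void update_cpu_load_nohz(int active)
	{
		struct rq *this_rq = this_rq();
		/*
		 * Idle NOHZ: the CPU contributed no load while the tick was
		 * stopped. NOHZ_FULL: the CPU was busy, so fold in its current
		 * weighted load for the missed ticks instead.
		 */
		unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;

		fold_missed_ticks(this_rq, load);	/* hypothetical helper */
	}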
From 1b034bd989aa4a396c13d305759c376c52595a97 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 17 Nov 2015 18:05:23 +0100
Subject: stop_machine: Make cpu_stop_queue_work() and stop_one_cpu_nowait() return bool

Change cpu_stop_queue_work() to return true if the work was queued and
change stop_one_cpu_nowait() to return the result of
cpu_stop_queue_work(). This makes it more useful; for example, you can
now allocate a cpu_stop_work for stop_one_cpu_nowait() and free it
either in the callback or when stop_one_cpu_nowait() fails. Currently
this is impossible because you can't know whether @fn will be called.

This also allows us to kill cpu_stop_done->executed; see the next
changes.

Signed-off-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Tejun Heo
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Milos Vyletel
Cc: Peter Zijlstra
Cc: Prarit Bhargava
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20151117170523.GA13955@redhat.com
Signed-off-by: Ingo Molnar
---
 include/linux/stop_machine.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 0adedca24c5b..9ef42e1f0b3a 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -29,7 +29,7 @@ struct cpu_stop_work {
 
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
 int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg);
-void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			 struct cpu_stop_work *work_buf);
 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
@@ -65,7 +65,7 @@ static void stop_one_cpu_nowait_workfn(struct work_struct *work)
 	preempt_enable();
 }
 
-static inline void stop_one_cpu_nowait(unsigned int cpu,
+static inline bool stop_one_cpu_nowait(unsigned int cpu,
 				       cpu_stop_fn_t fn, void *arg,
 				       struct cpu_stop_work *work_buf)
 {
@@ -74,7 +74,10 @@ static inline void stop_one_cpu_nowait(unsigned int cpu,
 		work_buf->fn = fn;
 		work_buf->arg = arg;
 		schedule_work(&work_buf->work);
+		return true;
 	}
+
+	return false;
 }
 
 static inline int stop_cpus(const struct cpumask *cpumask,
--
cgit v1.2.3
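A sketch of the allocation pattern the changelog describes, now possible
because queueing failure is visible to the caller. The stop_machine calls are
real; the my_* names are illustrative:

	struct my_stop_req {
		struct cpu_stop_work	work;
		void			*payload;
	};

	static int my_stop_fn(void *arg)
	{
		struct my_stop_req *req = arg;

		/* ... act on req->payload with the CPU stopped ... */
		kfree(req);	/* @fn ran: the callback owns the request */
		return 0;
	}

	static int my_queue_stop(unsigned int cpu, void *payload)
	{
		struct my_stop_req *req = kmalloc(sizeof(*req), GFP_KERNEL);

		if (!req)
			return -ENOMEM;
		req->payload = payload;

		if (!stop_one_cpu_nowait(cpu, my_stop_fn, req, &req->work)) {
			kfree(req);	/* @fn will never run: we still own it */
			return -ENOENT;
		}
		return 0;	/* ownership passed to my_stop_fn() */
	}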
From 7098c1eac75dc03fdbb7249171a6e68ce6044a5a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 19 Nov 2015 16:47:30 +0100
Subject: sched/cputime: Clarify vtime symbols and document them

The VTIME_SLEEPING state is used either when:

1) The task is sleeping and no tickless delta is to be added to the
   task's cputime stats.
2) The CPU isn't running vtime at all, so the same properties as in 1)
   apply.

Let's rename the vtime symbol to reflect both states.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Peter Zijlstra (Intel)
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Hiroshi Shimamoto
Cc: Linus Torvalds
Cc: Luiz Capitulino
Cc: Mike Galbraith
Cc: Paul E . McKenney
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1447948054-28668-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f425aac63317..3533168fe7d1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1522,8 +1522,11 @@ struct task_struct {
 	seqlock_t vtime_seqlock;
 	unsigned long long vtime_snap;
 	enum {
-		VTIME_SLEEPING = 0,
+		/* Task is sleeping or running in a CPU with VTIME inactive */
+		VTIME_INACTIVE = 0,
+		/* Task runs in userspace in a CPU with VTIME active */
 		VTIME_USER,
+		/* Task runs in kernelspace in a CPU with VTIME active */
 		VTIME_SYS,
 	} vtime_snap_whence;
 #endif
--
cgit v1.2.3

From 55dbdcfa05533f44c9416070b8a9f6432b22314a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 19 Nov 2015 16:47:32 +0100
Subject: sched/cputime: Rename vtime_accounting_enabled() to vtime_accounting_cpu_enabled()

vtime_accounting_enabled() checks whether vtime is running on the
current CPU and is, as such, a misnomer. Let's rename it to a function
whose name reflects its locality. We are going to need the current name
for a function that tells whether vtime runs at all on some CPU.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Peter Zijlstra (Intel)
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Hiroshi Shimamoto
Cc: Linus Torvalds
Cc: Luiz Capitulino
Cc: Mike Galbraith
Cc: Paul E . McKenney
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1447948054-28668-6-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/context_tracking.h |  4 ++--
 include/linux/vtime.h            | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 68b575afe5f5..d259274238db 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -86,7 +86,7 @@ static inline void context_tracking_init(void) { }
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 static inline void guest_enter(void)
 {
-	if (vtime_accounting_enabled())
+	if (vtime_accounting_cpu_enabled())
 		vtime_guest_enter(current);
 	else
 		current->flags |= PF_VCPU;
@@ -100,7 +100,7 @@ static inline void guest_exit(void)
 	if (context_tracking_is_enabled())
 		__context_tracking_exit(CONTEXT_GUEST);
 
-	if (vtime_accounting_enabled())
+	if (vtime_accounting_cpu_enabled())
 		vtime_guest_exit(current);
 	else
 		current->flags &= ~PF_VCPU;
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index c5165fd256f9..ca23e8348f70 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -10,14 +10,14 @@ struct task_struct;
 
 /*
- * vtime_accounting_enabled() definitions/declarations
+ * vtime_accounting_cpu_enabled() definitions/declarations
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-static inline bool vtime_accounting_enabled(void) { return true; }
+static inline bool vtime_accounting_cpu_enabled(void) { return true; }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static inline bool vtime_accounting_enabled(void)
+static inline bool vtime_accounting_cpu_enabled(void)
 {
 	if (context_tracking_is_enabled()) {
 		if (context_tracking_cpu_is_enabled())
@@ -29,7 +29,7 @@ static inline bool vtime_accounting_enabled(void)
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-static inline bool vtime_accounting_enabled(void) { return false; }
+static inline bool vtime_accounting_cpu_enabled(void) { return false; }
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
@@ -44,7 +44,7 @@ extern void vtime_task_switch(struct task_struct *prev);
 extern void vtime_common_task_switch(struct task_struct *prev);
 static inline void vtime_task_switch(struct task_struct *prev)
 {
-	if (vtime_accounting_enabled())
+	if (vtime_accounting_cpu_enabled())
 		vtime_common_task_switch(prev);
 }
 #endif /* __ARCH_HAS_VTIME_TASK_SWITCH */
@@ -59,7 +59,7 @@ extern void vtime_account_irq_enter(struct task_struct *tsk);
 extern void vtime_common_account_irq_enter(struct task_struct *tsk);
 static inline void vtime_account_irq_enter(struct task_struct *tsk)
 {
-	if (vtime_accounting_enabled())
+	if (vtime_accounting_cpu_enabled())
 		vtime_common_account_irq_enter(tsk);
 }
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -78,7 +78,7 @@ extern void vtime_gen_account_irq_exit(struct task_struct *tsk);
 
 static inline void vtime_account_irq_exit(struct task_struct *tsk)
 {
-	if (vtime_accounting_enabled())
+	if (vtime_accounting_cpu_enabled())
 		vtime_gen_account_irq_exit(tsk);
 }
--
cgit v1.2.3

From e592539466380279a9e6e6fdfe4545aa54f22593 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 19 Nov 2015 16:47:33 +0100
Subject: sched/cputime: Introduce vtime accounting check for readers

Readers need to know whether vtime runs at all on some CPU somewhere;
this is a fast-path check to determine whether we need to go further and
add up any tickless cputime delta.

This fast-path check uses context tracking state because vtime is tied
to context tracking for now. That check is confusing, though, so let's
use a vtime function that deals with the context tracking details inside
the vtime implementation instead.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Peter Zijlstra (Intel)
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Hiroshi Shimamoto
Cc: Linus Torvalds
Cc: Luiz Capitulino
Cc: Mike Galbraith
Cc: Paul E . McKenney
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1447948054-28668-7-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/vtime.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index ca23e8348f70..fa2196990f84 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -17,9 +17,20 @@ static inline bool vtime_accounting_cpu_enabled(void) { return true; }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+/*
+ * Checks if vtime is enabled on some CPU. Cputime readers want to be careful
+ * in that case and compute the tickless cputime.
+ * For now vtime state is tied to context tracking. We might want to decouple
+ * those later if necessary.
+ */
+static inline bool vtime_accounting_enabled(void)
+{
+	return context_tracking_is_enabled();
+}
+
 static inline bool vtime_accounting_cpu_enabled(void)
 {
-	if (context_tracking_is_enabled()) {
+	if (vtime_accounting_enabled()) {
 		if (context_tracking_cpu_is_enabled())
 			return true;
 	}
--
cgit v1.2.3
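A sketch of the reader-side fast path the new helper enables. The gtime
field and vtime_accounting_enabled() come from the patches above;
vtime_pending_delta() is hypothetical and stands in for the real
tickless-delta computation in kernel/sched/cputime.c:

	static cputime_t my_task_gtime(struct task_struct *t)
	{
		/* Fast path: no CPU runs vtime, so the field is up to date. */
		if (!vtime_accounting_enabled())
			return t->gtime;

		/* Some CPU may account ticklessly: add the pending delta. */
		return t->gtime + vtime_pending_delta(t);
	}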
From b7ce2277f087fd052e7e1bbf432f7fecbee82bb6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 19 Nov 2015 16:47:34 +0100
Subject: sched/cputime: Convert vtime_seqlock to seqcount

The cputime can only be updated by the current task itself, even in the
vtime case. So we can safely use a seqcount instead of a seqlock, as
there is no writer concurrency involved.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Peter Zijlstra (Intel)
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Hiroshi Shimamoto
Cc: Linus Torvalds
Cc: Luiz Capitulino
Cc: Mike Galbraith
Cc: Paul E . McKenney
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1447948054-28668-8-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/init_task.h | 2 +-
 include/linux/sched.h     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1c1ff7e4faa4..f2cb8d45513d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -150,7 +150,7 @@ extern struct task_group root_task_group;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)						\
-	.vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock),	\
+	.vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),		\
 	.vtime_snap = 0,						\
 	.vtime_snap_whence = VTIME_SYS,
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3533168fe7d1..3b0de68bce41 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1519,7 +1519,7 @@ struct task_struct {
 	cputime_t gtime;
 	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqlock_t vtime_seqlock;
+	seqcount_t vtime_seqcount;
 	unsigned long long vtime_snap;
 	enum {
 		/* Task is sleeping or running in a CPU with VTIME inactive */
--
cgit v1.2.3
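A sketch of both sides after the conversion. The writer is always the task
itself, so a bare seqcount suffices; readers loop until they see a stable
snapshot. The seqcount API calls are real, and the fields come from the diff
above; the wrapper names are illustrative:

	static void write_vtime_snap(struct task_struct *t, unsigned long long now)
	{
		write_seqcount_begin(&t->vtime_seqcount);
		t->vtime_snap = now;
		t->vtime_snap_whence = VTIME_SYS;
		write_seqcount_end(&t->vtime_seqcount);
	}

	static unsigned long long read_vtime_snap(struct task_struct *t)
	{
		unsigned long long snap;
		unsigned int seq;

		do {
			seq = read_seqcount_begin(&t->vtime_seqcount);
			snap = t->vtime_snap;
		} while (read_seqcount_retry(&t->vtime_seqcount, seq));

		return snap;
	}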
From 5a1078043f844074cbd53981432778a8d5dd56e9 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Tue, 8 Dec 2015 21:23:59 +0100
Subject: sched/core: Move sched_entity::avg into separate cache line

The sched_entity::avg field collides with read-mostly sched_entity data.

The perf c2c tool showed many read HITM accesses across many CPUs for
sched_entity's cfs_rq and my_q, while at the same time showing tons of
stores for avg.

After placing sched_entity::avg into a separate cache line, perf bench
sched pipe showed a speedup of around 20 seconds.

NOTE: I cut out all perf events except for cycles and instructions from
the following output.

Before:

  $ perf stat -r 5 perf bench sched pipe -l 10000000

  # Running 'sched/pipe' benchmark:
  # Executed 10000000 pipe operations between two processes

       Total time: 270.348 [sec]

        27.034805 usecs/op
            36989 ops/sec
  ...
     245,537,074,035      cycles                    #    1.433 GHz
     187,264,548,519      instructions              #    0.77  insns per cycle

       272.653840535 seconds time elapsed           ( +- 1.31% )

After:

  $ perf stat -r 5 perf bench sched pipe -l 10000000

  # Running 'sched/pipe' benchmark:
  # Executed 10000000 pipe operations between two processes

       Total time: 251.076 [sec]

        25.107678 usecs/op
            39828 ops/sec
  ...
     244,573,513,928      cycles                    #    1.572 GHz
     187,409,641,157      instructions              #    0.76  insns per cycle

       251.679315188 seconds time elapsed           ( +- 0.31% )

Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Cc: Arnaldo Carvalho de Melo
Cc: Don Zickus
Cc: Joe Mario
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1449606239-28602-1-git-send-email-jolsa@kernel.org
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 791b47e40317..0c0e78102850 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1268,8 +1268,13 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_SMP
-	/* Per entity load average tracking */
-	struct sched_avg	avg;
+	/*
+	 * Per entity load average tracking.
+	 *
+	 * Put into separate cache line so it does not
+	 * collide with read-mostly values above.
+	 */
+	struct sched_avg	avg ____cacheline_aligned_in_smp;
 #endif
 };
--
cgit v1.2.3
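The same false-sharing fix in miniature, on a hypothetical structure: the
write-mostly counter gets its own cache line so stores to it stop
invalidating the read-mostly fields cached on other CPUs:

	struct my_stats {
		/* read-mostly: set at init, read from many CPUs */
		unsigned int	id;
		unsigned int	flags;

		/* write-mostly: bumped constantly; keep off the line above */
		atomic64_t	events ____cacheline_aligned_in_smp;
	};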