From 0039962a1473f07fd5c8355bd8264be1eb87eb3e Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Tue, 3 Jan 2017 13:43:11 -0800
Subject: kernel/exit: Compute 'current' directly

This patch effectively replaces the tsk pointer dereferences (which are
obviously == current) with direct use of the get_current() macro. In this
case, do_exit() always passes current to exit_mm(), hence we can simply
get rid of the argument.

This is also a performance win on some archs such as x86-64 and ppc64 --
arm64 is no longer an issue.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: dave@stgolabs.net
Cc: mark.rutland@arm.com
Link: http://lkml.kernel.org/r/1483479794-14013-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar
---
 kernel/exit.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 8f14b866f9f6..2385d434a46e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -468,12 +468,12 @@ assign_new_owner:
  * Turn us into a lazy TLB process if we
  * aren't already..
  */
-static void exit_mm(struct task_struct *tsk)
+static void exit_mm(void)
 {
-	struct mm_struct *mm = tsk->mm;
+	struct mm_struct *mm = current->mm;
 	struct core_state *core_state;
 
-	mm_release(tsk, mm);
+	mm_release(current, mm);
 	if (!mm)
 		return;
 	sync_mm_rss(mm);
@@ -491,7 +491,7 @@ static void exit_mm(struct task_struct *tsk)
 
 		up_read(&mm->mmap_sem);
 
-		self.task = tsk;
+		self.task = current;
 		self.next = xchg(&core_state->dumper.next, &self);
 		/*
 		 * Implies mb(), the result of xchg() must be visible
@@ -501,22 +501,22 @@ static void exit_mm(struct task_struct *tsk)
 			complete(&core_state->startup);
 
 		for (;;) {
-			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+			set_task_state(current, TASK_UNINTERRUPTIBLE);
 			if (!self.task)	/* see coredump_finish() */
 				break;
 			freezable_schedule();
 		}
-		__set_task_state(tsk, TASK_RUNNING);
+		__set_task_state(current, TASK_RUNNING);
 		down_read(&mm->mmap_sem);
 	}
 	atomic_inc(&mm->mm_count);
-	BUG_ON(mm != tsk->active_mm);
+	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
-	task_lock(tsk);
-	tsk->mm = NULL;
+	task_lock(current);
+	current->mm = NULL;
 	up_read(&mm->mmap_sem);
 	enter_lazy_tlb(mm, current);
-	task_unlock(tsk);
+	task_unlock(current);
 	mm_update_next_owner(mm);
 	mmput(mm);
 	if (test_thread_flag(TIF_MEMDIE))
@@ -823,7 +823,7 @@ void __noreturn do_exit(long code)
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
 
-	exit_mm(tsk);
+	exit_mm();
 
 	if (group_dead)
 		acct_process();
--
cgit v1.2.3
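The performance aspect is easiest to see on x86-64, where get_current() boils
down to a single per-CPU load through this_cpu_read_stable(). A simplified
sketch of that arrangement (the shape of the arch code, not a verbatim copy
of it):

/*
 * Simplified sketch: on x86-64 the running task's pointer lives in a
 * per-CPU variable, so 'current' is one %gs-relative load, and the
 * "stable" read lets the compiler reuse the result within a function.
 */
DECLARE_PER_CPU(struct task_struct *, current_task);

static __always_inline struct task_struct *get_current(void)
{
	return this_cpu_read_stable(current_task);
}

#define current get_current()

Since repeated uses of current cost little more than reusing a register,
passing a task_struct pointer down to exit_mm() buys nothing over computing
current directly at the use sites.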
From 642fa448ae6b3a4e5e8737054a094173405b7643 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Tue, 3 Jan 2017 13:43:14 -0800
Subject: sched/core: Remove set_task_state()

This is a nasty interface and setting the state of a foreign task must
not be done. As of the following commit:

  be628be0956 ("bcache: Make gc wakeup sane, remove set_task_state()")

... everyone in the kernel calls set_task_state() with current, allowing
the helper to be removed.

However, as the comment indicates, it is still around for those archs
where computing current is more expensive than using a pointer, at least
in theory. An important arch that is affected is arm64, however this has
been addressed now [1] and performance is up to par, making no difference
between the two calls.

Of all the callers, if any, it's the locking bits that would care most
about this -- i.e. we end up passing a tsk pointer to a lot of the lock
slowpaths, and setting ->state on that.

The following numbers are based on two tests: a custom ad-hoc
microbenchmark that just measures latencies (for ~65 million calls)
between get_task_state() vs get_current_state(). Secondly, for a
higher-level overview, an unlink microbenchmark was used, which pounds
on a single file with open, close, unlink combos with increasing thread
counts (up to 4x ncpus). While the workload is quite unrealistic, it
does contend a lot on the inode mutex, or now rwsem.

[1] https://lkml.kernel.org/r/1483468021-8237-1-git-send-email-mark.rutland@arm.com

== 1. x86-64 ==

Avg runtime set_task_state():    601 msecs
Avg runtime set_current_state(): 552 msecs

                                            vanilla                 dirty
Hmean    unlink1-processes-2      36089.26 (  0.00%)    38977.33 (  8.00%)
Hmean    unlink1-processes-5      28555.01 (  0.00%)    29832.55 (  4.28%)
Hmean    unlink1-processes-8      37323.75 (  0.00%)    44974.57 ( 20.50%)
Hmean    unlink1-processes-12     43571.88 (  0.00%)    44283.01 (  1.63%)
Hmean    unlink1-processes-21     34431.52 (  0.00%)    38284.45 ( 11.19%)
Hmean    unlink1-processes-30     34813.26 (  0.00%)    37975.17 (  9.08%)
Hmean    unlink1-processes-48     37048.90 (  0.00%)    39862.78 (  7.59%)
Hmean    unlink1-processes-79     35630.01 (  0.00%)    36855.30 (  3.44%)
Hmean    unlink1-processes-110    36115.85 (  0.00%)    39843.91 ( 10.32%)
Hmean    unlink1-processes-141    32546.96 (  0.00%)    35418.52 (  8.82%)
Hmean    unlink1-processes-172    34674.79 (  0.00%)    36899.21 (  6.42%)
Hmean    unlink1-processes-203    37303.11 (  0.00%)    36393.04 ( -2.44%)
Hmean    unlink1-processes-224    35712.13 (  0.00%)    36685.96 (  2.73%)

== 2. ppc64le ==

Avg runtime set_task_state():    938 msecs
Avg runtime set_current_state(): 940 msecs

                                            vanilla                 dirty
Hmean    unlink1-processes-2      19269.19 (  0.00%)    30704.50 ( 59.35%)
Hmean    unlink1-processes-5      20106.15 (  0.00%)    21804.15 (  8.45%)
Hmean    unlink1-processes-8      17496.97 (  0.00%)    17243.28 ( -1.45%)
Hmean    unlink1-processes-12     14224.15 (  0.00%)    17240.21 ( 21.20%)
Hmean    unlink1-processes-21     14155.66 (  0.00%)    15681.23 ( 10.78%)
Hmean    unlink1-processes-30     14450.70 (  0.00%)    15995.83 ( 10.69%)
Hmean    unlink1-processes-48     16945.57 (  0.00%)    16370.42 ( -3.39%)
Hmean    unlink1-processes-79     15788.39 (  0.00%)    14639.27 ( -7.28%)
Hmean    unlink1-processes-110    14268.48 (  0.00%)    14377.40 (  0.76%)
Hmean    unlink1-processes-141    14023.65 (  0.00%)    16271.69 ( 16.03%)
Hmean    unlink1-processes-172    13417.62 (  0.00%)    16067.55 ( 19.75%)
Hmean    unlink1-processes-203    15293.08 (  0.00%)    15440.40 (  0.96%)
Hmean    unlink1-processes-234    13719.32 (  0.00%)    16190.74 ( 18.01%)
Hmean    unlink1-processes-265    16400.97 (  0.00%)    16115.22 ( -1.74%)
Hmean    unlink1-processes-296    14388.60 (  0.00%)    16216.13 ( 12.70%)
Hmean    unlink1-processes-320    15771.85 (  0.00%)    15905.96 (  0.85%)

x86-64 (known to be fast for get_current()/this_cpu_read_stable() caching)
and ppc64 (with paca) show similar improvements in the unlink
microbenches. The small delta for ppc64 (2 msecs) does not reflect the
gains seen in the unlink runs. In the case of x86, there was a decent
amount of variation in the latency runs, but always within a 20 to 50
msec increase; ppc was more constant.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: dave@stgolabs.net
Cc: mark.rutland@arm.com
Link: http://lkml.kernel.org/r/1483479794-14013-5-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar
---
 kernel/exit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2385d434a46e..27c68653e2fc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -501,12 +501,12 @@ static void exit_mm(void)
 			complete(&core_state->startup);
 
 		for (;;) {
-			set_task_state(current, TASK_UNINTERRUPTIBLE);
+			set_current_state(TASK_UNINTERRUPTIBLE);
 			if (!self.task)	/* see coredump_finish() */
 				break;
 			freezable_schedule();
 		}
-		__set_task_state(current, TASK_RUNNING);
+		__set_current_state(TASK_RUNNING);
 		down_read(&mm->mmap_sem);
 	}
 	atomic_inc(&mm->mm_count);
--
cgit v1.2.3
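The pattern that set_task_state() used to serve, and which after this change
is always written against current, is the canonical sleep loop visible in the
diff above. As a generic, self-contained sketch (the wait_for_done() wrapper
and the 'done' flag are made up for illustration):

#include <linux/sched.h>

/*
 * Canonical sleep loop, expressed purely in terms of the current task.
 * set_current_state() implies a full memory barrier, so the condition
 * check cannot be reordered before the ->state store, and a concurrent
 * waker (which sets the condition and then calls wake_up_process())
 * cannot be missed.
 */
static void wait_for_done(bool *done)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(*done))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

With the foreign-task variant gone, only a task itself can put itself to
sleep this way, which is exactly the invariant the change enforces.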
From 8f95c90ceb541a38ac16fec48c05142ef1450c25 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Wed, 11 Jan 2017 07:22:25 -0800
Subject: sched/wait, RCU: Introduce rcuwait machinery

rcuwait provides support for (single) RCU-safe task wait/wake
functionality, with the caveat that it must not be called after
exit_notify(), so that we avoid racing with the RCU delayed_put_task_struct()
callbacks, task_struct being RCU-unaware in this context -- for which we
similarly have the task_rcu_dereference() magic, but with different return
semantics, which can conflict with the wakeup side.

The interfaces are quite straightforward:

  rcuwait_wait_event()
  rcuwait_wake_up()

More details are in the comments, but it is perhaps worth mentioning at
least that users must provide proper serialization when waiting on a
condition, and avoid corrupting a concurrent waiter. Care must also be
taken between the task and the condition when calling the wakeup -- we
cannot miss wakeups. When porting users this is, for example, a given
when using waitqueues, in that everything is done under the q->lock. As
such, it can remove sources of non-preemptible unbounded work for
realtime.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Oleg Nesterov
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/1484148146-14210-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar
---
 kernel/exit.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 27c68653e2fc..a9441da69e29 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,6 +55,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <linux/rcuwait.h>
 
 #include <...>
 #include <...>
@@ -282,6 +283,35 @@ retry:
 	return task;
 }
 
+void rcuwait_wake_up(struct rcuwait *w)
+{
+	struct task_struct *task;
+
+	rcu_read_lock();
+
+	/*
+	 * Order condition vs @task, such that everything prior to the load
+	 * of @task is visible. This is the condition as to why the user called
+	 * rcuwait_trywake() in the first place. Pairs with set_current_state()
+	 * barrier (A) in rcuwait_wait_event().
+	 *
+	 *    WAIT                 WAKE
+	 *    [S] tsk = current    [S] cond = true
+	 *        MB (A)               MB (B)
+	 *    [L] cond             [L] tsk
+	 */
+	smp_rmb(); /* (B) */
+
+	/*
+	 * Avoid using task_rcu_dereference() magic as long as we are careful,
+	 * see comment in rcuwait_wait_event() regarding ->exit_state.
+	 */
+	task = rcu_dereference(w->task);
+	if (task)
+		wake_up_process(task);
+	rcu_read_unlock();
+}
+
 struct task_struct *try_get_task_struct(struct task_struct **ptask)
 {
 	struct task_struct *task;
--
cgit v1.2.3
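To make the wait/wake pairing concrete, a hypothetical single-waiter user of
the new machinery could look like the sketch below. The struct foo type, the
'done' flag and the foo_*() helpers are invented for illustration, and
rcuwait_init() is assumed to be the initializer that accompanies the two
interfaces named above:

#include <linux/rcuwait.h>
#include <linux/sched.h>

/* Hypothetical object that a single task may block on. */
struct foo {
	bool done;
	struct rcuwait wait;
};

static void foo_init(struct foo *f)
{
	f->done = false;
	rcuwait_init(&f->wait);
}

/* Waiter side: publish ourselves as w->task and sleep until 'done'. */
static void foo_wait(struct foo *f)
{
	rcuwait_wait_event(&f->wait, READ_ONCE(f->done));
}

/* Waker side: make the condition visible first, then wake the waiter. */
static void foo_complete(struct foo *f)
{
	WRITE_ONCE(f->done, true);
	rcuwait_wake_up(&f->wait);	/* barrier (B) pairs with the waiter's (A) */
}

As the changelog notes, serializing the condition itself and not calling
into this after exit_notify() remain the caller's responsibility.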