From 5307c9556bc17e3cd26d4e94fc3b2565921834de Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 8 May 2012 12:20:58 +0200 Subject: tick: Add tick skew boot option Let the user decide whether power consumption or jitter is the more important consideration for their machines. Quoting removal commit af5ab277ded04bd9bc6b048c5a2f0e7d70ef0867: "Historically, Linux has tried to make the regular timer tick on the various CPUs not happen at the same time, to avoid contention on xtime_lock. Nowadays, with the tickless kernel, this contention no longer happens since time keeping and updating are done differently. In addition, this skew is actually hurting power consumption in a measurable way on many-core systems." Problems: - Contrary to the above, systems do encounter contention on both xtime_lock and RCU structure locks when the tick is synchronized. - Moderate sized RT systems suffer intolerable jitter due to the tick being synchronized. - SGI reports the same for their large systems. - Fully utilized systems reap no power saving benefit from skew removal, but do suffer from resulting induced lock contention. - 0209f649 rcu: limit rcu_node leaf-level fanout This patch was born to combat lock contention which testing showed to have been _induced by_ skew removal. Skew the tick, contention disappeared virtually completely. 
Signed-off-by: Mike Galbraith Link: http://lkml.kernel.org/r/1336472458.21924.78.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..4eddbb5ea9c5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -814,6 +814,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) return HRTIMER_RESTART; } +static int sched_skew_tick; + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -831,6 +833,14 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + /* Offset the tick to avert xtime_lock contention. */ + if (sched_skew_tick) { + u64 offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } + for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, @@ -910,3 +920,11 @@ int tick_check_oneshot_change(int allow_nohz) tick_nohz_switch_to_nohz(); return 0; } + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); -- cgit v1.2.3 From e5400321a6f15ce0fe77c8455954f213ef7dcc54 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 9 May 2012 23:39:34 +0900 Subject: clockevents: Make clockevents_config() a global symbol Make clockevents_config() into a global symbol to allow it to be used by compiled-in clockevent drivers. This is needed by drivers that want to update the timer frequency after registration time. 
Signed-off-by: Magnus Damm Tested-by: Simon Horman Cc: arnd@arndb.de Cc: johnstul@us.ibm.com Cc: rjw@sisk.pl Cc: lethal@linux-sh.org Cc: gregkh@linuxfoundation.org Cc: olof@lixom.net Cc: Magnus Damm Link: http://lkml.kernel.org/r/20120509143934.27521.46553.sendpatchset@w520 Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -static void clockevents_config(struct clock_event_device *dev, - u32 freq) +void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; -- cgit v1.2.3 From 62cf20b32aee4ae889a2eb40fd41c0eab73de970 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 14:08:57 +0200 Subject: tick: Move skew_tick option into the HIGH_RES_TIMER section commit 5307c95 (tick: Add tick skew boot option) broke the !CONFIG_HIGH_RES_TIMERS build. Move the boot option parsing into the CONFIG_HIGH_RES_TIMERS section. 
Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Mike Galbraith --- kernel/time/tick-sched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4eddbb5ea9c5..efd386667536 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -816,6 +816,14 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) static int sched_skew_tick; +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -920,11 +928,3 @@ int tick_check_oneshot_change(int allow_nohz) tick_nohz_switch_to_nohz(); return 0; } - -static int __init skew_tick(char *str) -{ - get_option(&str, &sched_skew_tick); - - return 0; -} -early_param("skew_tick", skew_tick); -- cgit v1.2.3 From 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 17:15:29 +0200 Subject: sched/nohz: Fix rq->cpu_load calculations some more Follow up on commit 556061b00 ("sched/nohz: Fix rq->cpu_load[] calculations") since while that fixed the busy case it regressed the mostly idle case. Add a callback from the nohz exit to also age the rq->cpu_load[] array. This closes the hole where either there was no nohz load balance pass during the nohz, or there was a 'significant' amount of idle time between the last nohz balance and the nohz exit. So we'll update unconditionally from the tick to not insert any accidental 0 load periods while busy, and we try and catch up from nohz idle balance and nohz exit. Both these are still prone to missing a jiffy, but that has always been the case. 
Signed-off-by: Peter Zijlstra Cc: pjt@google.com Cc: Venkatesh Pallipadi Link: http://lkml.kernel.org/n/tip-kt0trz0apodbf84ucjfdbr1a@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..0c927cd85345 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); + update_cpu_load_nohz(); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* -- cgit v1.2.3 From fad0c66c4bb836d57a5f125ecd38bed653ca863a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 30 May 2012 10:54:57 -0700 Subject: timekeeping: Fix CLOCK_MONOTONIC inconsistency during leapsecond Commit 6b43ae8a61 (ntp: Fix leap-second hrtimer livelock) broke the leapsecond update of CLOCK_MONOTONIC. The missing leapsecond update to wall_to_monotonic causes discontinuities in CLOCK_MONOTONIC. Adjust wall_to_monotonic when NTP inserted a leapsecond. 
Reported-by: Richard Cochran Signed-off-by: John Stultz Tested-by: Richard Cochran Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1338400497-12420-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6e46cacf5969..6f46a00a1e8a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } /* Accumulate raw time */ @@ -1077,6 +1078,7 @@ static void update_wall_time(void) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } timekeeping_update(false); -- cgit v1.2.3 From aa9b16306e3243229580ff889cc59fd66bf77973 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 16:41:44 -0700 Subject: rcu: Precompute RCU_FAST_NO_HZ timer offsets When a CPU is entering dyntick-idle mode, tick_nohz_stop_sched_tick() calls rcu_needs_cpu() to see if RCU needs that CPU, and, if not, computes the next wakeup time based on the timer wheels. Only later, when actually entering the idle loop, rcu_prepare_for_idle() will be invoked. In some cases, rcu_prepare_for_idle() will post timers to wake the CPU back up. But all for naught: The next wakeup time for the CPU has already been computed, and posting a timer afterwards does not force that wakeup time to be recomputed. This means that the timers posted by rcu_prepare_for_idle() have no effect. This is not a problem on a busy system because something else will wake up the CPU soon enough. However, on lightly loaded systems, the CPU might stay asleep for a considerable length of time. 
If that CPU has a callback that the rest of the system is waiting on, the system might run very slowly or (in theory) even hang. This commit avoids this problem by having rcu_needs_cpu() give tick_nohz_stop_sched_tick() an estimate of when RCU will need the CPU to wake back up, which tick_nohz_stop_sched_tick() takes into account when programming the CPU's wakeup time. An alternative approach is for rcu_prepare_for_idle() to use hrtimers instead of normal timers, but timers are much more efficient than are hrtimers for frequently and repeatedly posting and cancelling a given timer, which is exactly what RCU_FAST_NO_HZ does. Reported-by: Pascal Chapperon Reported-by: Heiko Carstens Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/time/tick-sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..52f5ebbd443b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + unsigned long rcu_delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; @@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = 
next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } /* * Do not stop the tick, if we are only one off -- cgit v1.2.3