From 91e5a2170e795989da9f90c18ba18984f23acc5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Apr 2015 21:02:22 +0000 Subject: hrtimer: Document hrtimer_forward[_now]() proper Document the calling context conditions. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20150413210035.178751779@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 76d4bd962b19..b1a74ee3e99a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -801,6 +801,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { -- cgit v1.2.3 From d9f0acdeef48570c4e6159d3108f12b64571392e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 14 Apr 2015 21:08:25 +0000 Subject: hrtimer: Update active_bases before calling hrtimer_force_reprogram() 'active_bases' indicates which clock-base have active timer. The intention of this bit field was to avoid evaluating inactive bases. It was introduced with the introduction of the BOOTTIME and TAI clock bases, but it was never brought into full use. We want to use it now, but in __remove_hrtimer() the update happens after the calling hrtimer_force_reprogram() which has to evaluate all clock bases for the next expiring timer. So in case the last timer of a clock base got removed we still see the active bit and therefor evaluate the clock base for no value. There are further optimizations possible when active_bases is updated in the right place. Move the update before the call to hrtimer_force_reprogram() [ tglx: Massaged changelog ] Signed-off-by: Viresh Kumar Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/20150414203500.533438642@linutronix.de Link: http://lkml.kernel.org/r/c7c8ebcd9ed88bb09d76059c745a1fafb48314e7.1428039899.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b1a74ee3e99a..9abd50b60e83 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -887,6 +887,9 @@ static void __remove_hrtimer(struct hrtimer *timer, next_timer = timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); + if (!timerqueue_getnext(&base->active)) + base->cpu_base->active_bases &= ~(1 << base->index); + if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ @@ -900,8 +903,6 @@ static void __remove_hrtimer(struct hrtimer *timer, } #endif } - if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); out: timer->state = newstate; } -- cgit v1.2.3 From 398ca17fb54b212cdc9da7ff4a17a35c48dd2103 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:27 +0000 Subject: hrtimer: Get rid of the resolution field in hrtimer_clock_base The field has no value because all clock bases have the same resolution. The resolution only changes when we switch to high resolution timer mode. We can evaluate that from a single static variable as well. In the !HIGHRES case its simply a constant. Export the variable, so we can simplify the usage sites. Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.645454122@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9abd50b60e83..965687a1fce6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -66,7 +66,6 @@ */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { @@ -74,25 +73,21 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, - .resolution = KTIME_LOW_RES, }, } }; @@ -478,6 +473,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) * High resolution timer enabled ? */ static int hrtimer_hres_enabled __read_mostly = 1; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode @@ -660,7 +657,7 @@ static void retrigger_next_event(void *arg) */ static int hrtimer_switch_to_hres(void) { - int i, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; @@ -676,8 +673,7 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - base->clock_base[i].resolution = KTIME_HIGH_RES; + hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ @@ -820,8 +816,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta.tv64 < 0) return 0; - if (interval.tv64 < timer->base->resolution.tv64) - interval.tv64 = timer->base->resolution.tv64; + if (interval.tv64 < hrtimer_resolution) + interval.tv64 = hrtimer_resolution; if (unlikely(delta.tv64 >= interval.tv64)) { s64 incr = ktime_to_ns(interval); @@ -963,7 +959,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * timeouts. This will go away with the GTOD framework. */ #ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, base->resolution); + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); #endif } @@ -1193,12 +1189,8 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { - struct hrtimer_cpu_base *cpu_base; - int base = hrtimer_clockid_to_base(which_clock); - - cpu_base = raw_cpu_ptr(&hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); - + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; return 0; } EXPORT_SYMBOL_GPL(hrtimer_get_res); -- cgit v1.2.3 From 056a3cacbc46e5aca27b350ce4ecb3b33ebb0700 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:32 +0000 Subject: hrtimer: Get rid of hrtimer_get_res() The resolution is directly accessible now. So its simpler just to fill in the values of the timespec and be done with it. Text size reduction (combined with "hrtimer: Get rid of the resolution field in hrtimer_clock_base"): x8664 -61, i386 -221, ARM -60, power64 -48 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.879888080@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 965687a1fce6..73131dab787e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1179,22 +1179,6 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); -/** - * hrtimer_get_res - get the timer resolution for a clock - * @which_clock: which clock to query - * @tp: pointer to timespec variable to store the resolution - * - * Store the resolution of the clock selected by @which_clock in the - * variable pointed to by @tp. - */ -int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = hrtimer_resolution; - return 0; -} -EXPORT_SYMBOL_GPL(hrtimer_get_res); - static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) { struct hrtimer_clock_base *base = timer->base; -- cgit v1.2.3 From a6ffebce7f89f6f97cc22838a5d4383b15d6774f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:34 +0000 Subject: hrtimer: Make the statistics fields smaller No point in having usigned long for /proc/timer_list statistics. Make them unsigned int. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.959773467@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 73131dab787e..874e0914eb3c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1327,8 +1327,8 @@ retry: cpu_base->hang_detected = 1; raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); - if (delta.tv64 > cpu_base->max_hang_time.tv64) - cpu_base->max_hang_time = delta; + if ((unsigned int)delta.tv64 > cpu_base->max_hang_time) + cpu_base->max_hang_time = (unsigned int) delta.tv64; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. -- cgit v1.2.3 From 21d6d52a1b7028e6a6840bd82e354aefa9a5e203 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:35 +0000 Subject: hrtimer: Get rid of softirq time The softirq time field in the clock bases is an optimization from the early days of hrtimers. It provides a coarse "jiffies" like time mostly for self rearming timers. But that comes with a price: - Larger code size - Extra storage space - Duplicated functions with really small differences The benefit of this is optimization is marginal for contemporary systems. Consolidate everything on the high resolution timer implementation. This makes further optimizations possible. Text size reduction: x8664 -95, i386 -356, ARM -148, ARM64 -40, power64 -16 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.039977424@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 148 ++++++++++++++++++++------------------------------ 1 file changed, 59 insertions(+), 89 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 874e0914eb3c..9e111dd83ca3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -104,27 +104,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) return hrtimer_clock_to_base_table[clock_id]; } - -/* - * Get the coarse grained time at the softirq based on xtime and - * wall_to_monotonic. - */ -static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) -{ - ktime_t xtim, mono, boot, tai; - ktime_t off_real, off_boot, off_tai; - - mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); - boot = ktime_add(mono, off_boot); - xtim = ktime_add(mono, off_real); - tai = ktime_add(mono, off_tai); - - base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; - base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; -} - /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -466,6 +445,15 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) } #endif +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -516,7 +504,12 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - ktime_t expires_next = __hrtimer_get_next_event(cpu_base); + ktime_t expires_next; + + if (!cpu_base->hres_active) + return; + + expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -625,15 +618,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) -{ - ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; - ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - - return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); -} - /* * Retrigger next event is called after clock was set * @@ -1179,10 +1163,10 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t *now) { - struct hrtimer_clock_base *base = timer->base; - struct hrtimer_cpu_base *cpu_base = base->cpu_base; enum hrtimer_restart (*fn)(struct hrtimer *); int restart; @@ -1219,34 +1203,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) timer->state &= ~HRTIMER_STATE_CALLBACK; } -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer interrupt - * Called with interrupts disabled - */ -void hrtimer_interrupt(struct clock_event_device *dev) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) { - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; - - BUG_ON(!cpu_base->hres_active); - cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); - entry_time = now = hrtimer_update_base(cpu_base); -retry: - cpu_base->in_hrtirq = 1; - /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. - */ - cpu_base->expires_next.tv64 = KTIME_MAX; + int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *base; @@ -1279,9 +1238,42 @@ retry: if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - __run_hrtimer(timer, &basenow); + __run_hrtimer(cpu_base, base, timer, &basenow); } } +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + int retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + cpu_base->in_hrtirq = 1; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + + __hrtimer_run_queues(cpu_base, now); + /* Reevaluate the clock bases for the next expiry */ expires_next = __hrtimer_get_next_event(cpu_base); /* @@ -1416,38 +1408,16 @@ void hrtimer_run_pending(void) */ void hrtimer_run_queues(void) { - struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base; - int index, gettime = 1; + ktime_t now; if (hrtimer_hres_active()) return; - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - if (!timerqueue_getnext(&base->active)) - continue; - - if (gettime) { - hrtimer_get_softirq_time(cpu_base); - gettime = 0; - } - - raw_spin_lock(&cpu_base->lock); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= - hrtimer_get_expires_tv64(timer)) - break; - - __run_hrtimer(timer, &base->softirq_time); - } - raw_spin_unlock(&cpu_base->lock); - } + raw_spin_lock(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now); + raw_spin_unlock(&cpu_base->lock); } /* -- cgit v1.2.3 From 868a3e915f7f5eba8f8cb4f7da2276760807c51c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:37 +0000 Subject: hrtimer: Make offset update smarter On every tick/hrtimer interrupt we update the offset variables of the clock bases. That's silly because these offsets change very seldom. Add a sequence counter to the time keeping code which keeps track of the offset updates (clock_was_set()). Have a sequence cache in the hrtimer cpu bases to evaluate whether the offsets must be updated or not. This allows us later to avoid pointless cacheline pollution. Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203501.132820245@linutronix.de Signed-off-by: Thomas Gleixner Cc: John Stultz --- kernel/time/hrtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9e111dd83ca3..8ce9b3138017 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -451,7 +451,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); + return ktime_get_update_offsets_now(&base->clock_was_set_seq, + offs_real, offs_boot, offs_tai); } /* High resolution timer related functions */ -- cgit v1.2.3 From e19ffe8be2cd0a1f726b235443eba21e64f6be5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:39 +0000 Subject: hrtimer: Use bits for various boolean indicators No point in wasting 12 byte storage space. Generates better code as well. Text size reduction: x8664 -64, i386 -16, ARM -132, ARM64 -0, power64 -48 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.227955358@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8ce9b3138017..9bbfe33f6575 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -492,9 +492,14 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ? */ +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return cpu_base->hres_active; +} + static inline int hrtimer_hres_active(void) { - return __this_cpu_read(hrtimer_bases.hres_active); + return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } /* @@ -628,7 +633,7 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - if (!hrtimer_hres_active()) + if (!base->hres_active) return; raw_spin_lock(&base->lock); @@ -685,6 +690,7 @@ void clock_was_set_delayed(void) #else +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } @@ -862,25 +868,27 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { + struct hrtimer_cpu_base *cpu_base = base->cpu_base; struct timerqueue_node *next_timer; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; next_timer = timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); + cpu_base->active_bases &= ~(1 << base->index); if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) { + if (reprogram && cpu_base->hres_active) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (base->cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(base->cpu_base, 1); + if (cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(cpu_base, 1); } #endif } @@ -1114,7 +1122,7 @@ ktime_t hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) + if (!__hrtimer_hres_active(cpu_base)) mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), ktime_get()); @@ -1412,7 +1420,7 @@ void hrtimer_run_queues(void) struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t now; - if (hrtimer_hres_active()) + if (__hrtimer_hres_active(cpu_base)) return; raw_spin_lock(&cpu_base->lock); -- cgit v1.2.3 From 34aee88a02ba296a8e8c9523cdf77147731903f1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:41 +0000 Subject: hrtimer: Use cpu_base->active_base for hotpath iterators The active_bases field is guaranteed to be in sync with the timerqueue of the corresponding clock base. So we can use it for iterating over the clock bases. This allows to break out early if no more active clock bases are available and avoids touching the cache lines of inactive clock bases. Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.322887675@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9bbfe33f6575..fce0ccf97b51 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -419,16 +419,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; - int i; + unsigned int active = cpu_base->active_bases; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + for (; active; base++, active >>= 1) { struct timerqueue_node *next; struct hrtimer *timer; - next = timerqueue_getnext(&base->active); - if (!next) + if (!(active & 0x01)) continue; + next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires.tv64 < expires_next.tv64) @@ -1214,17 +1214,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) { - int i; + struct hrtimer_clock_base *base = cpu_base->clock_base; + unsigned int active = cpu_base->active_bases; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - struct hrtimer_clock_base *base; + for (; active; base++, active >>= 1) { struct timerqueue_node *node; ktime_t basenow; - if (!(cpu_base->active_bases & (1 << i))) + if (!(active & 0x01)) continue; - base = cpu_base->clock_base + i; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { -- cgit v1.2.3 From b97f44c9b658d52e0139c947ea5519e51ba38d81 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:47 +0000 Subject: hrtimer: Make use of timerqueue_add/del return values Use the return value instead of reevaluating the information. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.658152945@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fce0ccf97b51..0cd1e0b8099d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -842,7 +842,6 @@ static int enqueue_hrtimer(struct hrtimer *timer, { debug_activate(timer); - timerqueue_add(&base->active, &timer->node); base->cpu_base->active_bases |= 1 << base->index; /* @@ -851,7 +850,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ timer->state |= HRTIMER_STATE_ENQUEUED; - return (&timer->node == base->active.next); + return timerqueue_add(&base->active, &timer->node); } /* @@ -875,8 +874,7 @@ static void __remove_hrtimer(struct hrtimer *timer, goto out; next_timer = timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); - if (!timerqueue_getnext(&base->active)) + if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); if (&timer->node == next_timer) { -- cgit v1.2.3 From 895bdfa793f6e912d1a58fc445b3dd4d686f7bd3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:49 +0000 Subject: hrtimer: Keep pointer to first timer and simplify __remove_hrtimer() __remove_hrtimer() needs to evaluate the expiry time to figure out whether the timer which is removed is eventually the first expiring timer on the cpu. Keep a pointer to it, which is lazily updated, so we can avoid the evaluation dance and retrieve the information from there. Generates slightly better code. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.752838019@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0cd1e0b8099d..30178d0656cf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -415,12 +415,21 @@ static inline void debug_deactivate(struct hrtimer *timer) } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *timer) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + cpu_base->next_timer = timer; +#endif +} + static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; unsigned int active = cpu_base->active_bases; + hrtimer_update_next_timer(cpu_base, NULL); for (; active; base++, active >>= 1) { struct timerqueue_node *next; struct hrtimer *timer; @@ -431,8 +440,10 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires.tv64 < expires_next.tv64) + if (expires.tv64 < expires_next.tv64) { expires_next = expires; + hrtimer_update_next_timer(cpu_base, timer); + } } /* * clock_was_set() might have changed base->offset of any of @@ -597,6 +608,8 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (cpu_base->in_hrtirq) return 0; + cpu_base->next_timer = timer; + /* * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry @@ -868,30 +881,27 @@ static void __remove_hrtimer(struct hrtimer *timer, unsigned long newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - struct timerqueue_node *next_timer; + unsigned int state = timer->state; - if (!(timer->state & HRTIMER_STATE_ENQUEUED)) - goto out; + timer->state = newstate; + if (!(state & HRTIMER_STATE_ENQUEUED)) + return; - next_timer = timerqueue_getnext(&base->active); if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); - if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS - /* Reprogram the clock event device. if enabled */ - if (reprogram && cpu_base->hres_active) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(cpu_base, 1); - } + /* + * Note: If reprogram is false we do not update + * cpu_base->next_timer. This happens when we remove the first + * timer on a remote cpu. No harm as we never dereference + * cpu_base->next_timer. So the worst thing what can happen is + * an superflous call to hrtimer_force_reprogram() on the + * remote cpu later on if the same timer gets enqueued again. + */ + if (reprogram && timer == cpu_base->next_timer) + hrtimer_force_reprogram(cpu_base, 1); #endif - } -out: - timer->state = newstate; } /* -- cgit v1.2.3 From c6eb3f70d4482806dc2d3e1e3c7736f497b1d418 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:51 +0000 Subject: hrtimer: Get rid of hrtimer softirq hrtimer softirq is a leftover from the initial implementation and serves only the purpose to handle the enqueueing of already expired timers in the high resolution timer mode. We discussed whether we change the return value and force all start sites to handle that the timer is already expired, but that would be a Herculean task and I'm not sure whether its a good idea to enforce that handling on everyone. A simpler solution is to enforce a timer interrupt instead of raising and scheduling a softirq. Just use the existing infrastructure to do so and remove all the softirq leftovers. The HRTIMER softirq enum is now unused, but kept around because trace parsers rely on the existing numbering. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.840834708@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 163 +++++++++++++------------------------------------- 1 file changed, 43 insertions(+), 120 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30178d0656cf..fc6b6d25f93d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -555,59 +555,48 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) } /* - * Shared reprogramming for clock_realtime and clock_monotonic - * * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * - * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming - * and no expiry check happens. The timer gets enqueued into the rbtree. The - * reprogramming and expiry check is done in the hrtimer_interrupt or in the - * softirq. - * * Called with interrupts disabled and base->cpu_base.lock held */ -static int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static void hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - int res; WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* - * When the callback is running, we do not reprogram the clock event - * device. The timer callback is either running on a different CPU or - * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. */ - if (hrtimer_callback_running(timer)) - return 0; + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; /* * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Nothing wrong - * about that, just avoid to call into the tick code, which - * has now objections against negative expiry values. + * expiry time which is less than base->offset. Set it to 0. */ if (expires.tv64 < 0) - return -ETIME; + expires.tv64 = 0; if (expires.tv64 >= cpu_base->expires_next.tv64) - return 0; - - /* - * When the target cpu of the timer is currently executing - * hrtimer_interrupt(), then we do not touch the clock event - * device. hrtimer_interrupt() will reevaluate all clock bases - * before reprogramming the device. - */ - if (cpu_base->in_hrtirq) - return 0; + return; + /* Update the pointer to the next expiring timer */ cpu_base->next_timer = timer; /* @@ -617,15 +606,14 @@ static int hrtimer_reprogram(struct hrtimer *timer, * to make progress. */ if (cpu_base->hang_detected) - return 0; + return; /* - * Clockevents returns -ETIME, when the event was in the past. + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. */ - res = tick_program_event(expires, 0); - if (!IS_ERR_VALUE(res)) - cpu_base->expires_next = expires; - return res; + cpu_base->expires_next = expires; + tick_program_event(expires, 1); } /* @@ -660,19 +648,11 @@ static void retrigger_next_event(void *arg) */ static int hrtimer_switch_to_hres(void) { - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); - unsigned long flags; - - if (base->hres_active) - return 1; - - local_irq_save(flags); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", base->cpu); return 0; } base->hres_active = 1; @@ -681,7 +661,6 @@ static int hrtimer_switch_to_hres(void) tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); - local_irq_restore(flags); return 1; } @@ -984,26 +963,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * on dynticks target. */ wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && - hrtimer_reprogram(timer, new_base)) { - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (wakeup) { - /* - * We need to drop cpu_base->lock to avoid a - * lock ordering issue vs. rq->lock. - */ - raw_spin_unlock(&new_base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - local_irq_restore(flags); - return ret; - } else { - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - } + } else { + hrtimer_reprogram(timer, new_base); } unlock_hrtimer_base(timer, &flags); @@ -1354,7 +1315,7 @@ retry: * local version of hrtimer_peek_ahead_timers() called with interrupts * disabled. */ -static void __hrtimer_peek_ahead_timers(void) +static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; @@ -1366,29 +1327,6 @@ static void __hrtimer_peek_ahead_timers(void) hrtimer_interrupt(td->evtdev); } -/** - * hrtimer_peek_ahead_timers -- run soft-expired timers now - * - * hrtimer_peek_ahead_timers will peek at the timer queue of - * the current cpu and check if there are any timers for which - * the soft expires time has passed. If any such timers exist, - * they are run immediately and then removed from the timer queue. - * - */ -void hrtimer_peek_ahead_timers(void) -{ - unsigned long flags; - - local_irq_save(flags); - __hrtimer_peek_ahead_timers(); - local_irq_restore(flags); -} - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } @@ -1396,31 +1334,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. - */ -void hrtimer_run_pending(void) -{ - if (hrtimer_hres_active()) - return; - - /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. - */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); -} - -/* - * Called from hardirq context every jiffy + * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { @@ -1430,6 +1344,18 @@ void hrtimer_run_queues(void) if (__hrtimer_hres_active(cpu_base)) return; + /* + * This _is_ ugly: We have to check periodically, whether we + * can switch to highres and / or nohz mode. The clocksource + * switch happens with xtime_lock held. Notification from + * there only sets the check bit in the tick_oneshot code, + * otherwise we might deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { + hrtimer_switch_to_hres(); + return; + } + raw_spin_lock(&cpu_base->lock); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now); @@ -1700,9 +1626,6 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** -- cgit v1.2.3 From c1ad348b452aacd784fb97403d03d71723c72ee1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:58 +0000 Subject: tick: Nohz: Rework next timer evaluation The evaluation of the next timer in the nohz code is based on jiffies while all the tick internals are nano seconds based. We have also to convert hrtimer nanoseconds to jiffies in the !highres case. That's just wrong and introduces interesting corner cases. Turn it around and convert the next timer wheel timer expiry and the rcu event to clock monotonic and base all calculations on nanoseconds. That identifies the case where no timer is pending clearly with an absolute expiry value of KTIME_MAX. Makes the code more readable and gets rid of the jiffies magic in the nohz code. Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: John Stultz Cc: Marcelo Tosatti Link: http://lkml.kernel.org/r/20150414203502.184198593@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fc6b6d25f93d..179b991cfdcb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1080,26 +1080,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); /** * hrtimer_get_next_event - get the time until next expiry event * - * Returns the delta to the next expiry event or KTIME_MAX if no timer - * is pending. + * Returns the next expiry time or KTIME_MAX if no timer is pending. */ -ktime_t hrtimer_get_next_event(void) +u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t mindelta = { .tv64 = KTIME_MAX }; + u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) - mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), - ktime_get()); + expires = __hrtimer_get_next_event(cpu_base).tv64; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - if (mindelta.tv64 < 0) - mindelta.tv64 = 0; - return mindelta; + return expires; } #endif -- cgit v1.2.3 From 58f1f803f1d6ef9ab280de13246d65970a09cb95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:08 +0000 Subject: hrtimer: Get rid of __hrtimer_start_range_ns() No more callers. Remove the leftovers. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.707871492@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 179b991cfdcb..88d6ea25dde4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -916,9 +916,20 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) return 0; } -int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode, - int wakeup) +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -971,25 +982,6 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return ret; } -EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); - -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); -} EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** @@ -1006,7 +998,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); + return hrtimer_start_range_ns(timer, tim, 0, mode); } EXPORT_SYMBOL_GPL(hrtimer_start); -- cgit v1.2.3 From 02a171af1a46966dcdb5b38cdc33e4f43e92c778 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:10 +0000 Subject: hrtimer: Make hrtimer_start() a inline wrapper No point for an extra export just to set the extra argument of hrtimer_start_range_ns() to 0. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.808544539@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88d6ea25dde4..e5cf71aa6d77 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -984,25 +984,6 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); -/** - * hrtimer_start - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) -{ - return hrtimer_start_range_ns(timer, tim, 0, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_start); - - /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop -- cgit v1.2.3 From 3f7b349ac14885472a19c46840235114e5ad5e52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:11 +0000 Subject: hrtimer: Remove bogus hrtimer_active() check The check for hrtimer_active() after starting the timer is pointless. If the timer is inactive it has expired already and therefor the task pointer is already NULL. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.907149271@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e5cf71aa6d77..c38f0b6024b4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1361,8 +1361,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t->timer, mode); - if (!hrtimer_active(&t->timer)) - t->task = NULL; if (likely(t->task)) freezable_schedule(); @@ -1633,8 +1631,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, hrtimer_init_sleeper(&t, current); hrtimer_start_expires(&t.timer, mode); - if (!hrtimer_active(&t.timer)) - t.task = NULL; if (likely(t.task)) schedule(); -- cgit v1.2.3 From 61699e13072a89880aa584dcc64c6da465fb2ccc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:23 +0000 Subject: hrtimer: Remove hrtimer_start() return value No user was ever interested whether the timer was active or not when it was started. All abusers of the return value are gone, so get rid of it. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.483556394@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c38f0b6024b4..beab02d3ff1e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -923,22 +923,18 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * @delta_ns: "slack" range for the timer * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret, leftmost; + int leftmost; base = lock_hrtimer_base(timer, &flags); /* Remove an active timer from the queue: */ - ret = remove_hrtimer(timer, base); + remove_hrtimer(timer, base); if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, base->get_time()); @@ -962,11 +958,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, timer_stats_hrtimer_set_start_info(timer); leftmost = enqueue_hrtimer(timer, new_base); - - if (!leftmost) { - unlock_hrtimer_base(timer, &flags); - return ret; - } + if (!leftmost) + goto unlock; if (!hrtimer_is_hres_active(timer)) { /* @@ -977,10 +970,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } else { hrtimer_reprogram(timer, new_base); } - +unlock: unlock_hrtimer_base(timer, &flags); - - return ret; } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); -- cgit v1.2.3 From 19d9f4225dd6a47fca430f15eeae345ceb95c301 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:25 +0000 Subject: hrtimer: Avoid locking in hrtimer_cancel() if timer not active We can do a lockless check for hrtimer_active before actually taking the lock in hrtimer[_try_to]_cancel. This is useful for hotpath users like nanosleep as they avoid the lock dance when the timer has expired. This is safe because active is true when the timer is enqueued or the callback is running. Taking the hrtimer base lock does not protect against concurrent hrtimer_start calls, the callsite has to do the proper serialization itself. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.580273114@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index beab02d3ff1e..3bac94269a98 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -991,6 +991,15 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) unsigned long flags; int ret = -1; + /* + * Check lockless first. If the timer is not active (neither + * enqueued nor running the callback, nothing to do here. The + * base lock does not serialize against a concurrent enqueue, + * so we can avoid taking it. + */ + if (!hrtimer_active(timer)) + return 0; + base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) -- cgit v1.2.3 From 5de2755c8c8b3a6b8414870e2c284914a2b42e4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 May 2014 15:49:48 +0200 Subject: hrtimer: Allow concurrent hrtimer_start() for self restarting timers Because we drop cpu_base->lock around calling hrtimer::function, it is possible for hrtimer_start() to come in between and enqueue the timer. If hrtimer::function then returns HRTIMER_RESTART we'll hit the BUG_ON because HRTIMER_STATE_ENQUEUED will be set. Since the above is a perfectly valid scenario, remove the BUG_ON and make the enqueue_hrtimer() call conditional on the timer not being enqueued already. NOTE: in that concurrent scenario its entirely common for both sites to want to modify the hrtimer, since hrtimers don't provide serialization themselves be sure to provide some such that the hrtimer::function and the hrtimer_start() caller don't both try and fudge the expiration state at the same time. To that effect, add a WARN when someone tries to forward an already enqueued timer, the most common way to change the expiry of self restarting timers. Ideally we'd put the WARN in everything modifying the expiry but most of that is inlines and we don't need the bloat. Fixes: 2d44ae4d7135 ("hrtimer: clean up cpu->base locking tricks") Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Roman Gushchin Cc: Paul Turner Link: http://lkml.kernel.org/r/20150415113105.GT5029@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3bac94269a98..4adf32067862 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -799,6 +799,9 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta.tv64 < 0) return 0; + if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + return 0; + if (interval.tv64 < hrtimer_resolution) interval.tv64 = hrtimer_resolution; @@ -1139,11 +1142,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, * Note: We clear the CALLBACK bit after enqueue_hrtimer and * we do not reprogramm the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() + * + * Note: Because we dropped the cpu_base->lock above, + * hrtimer_start_range_ns() can have popped in and enqueued the timer + * for us already. */ - if (restart != HRTIMER_NORESTART) { - BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + if (restart != HRTIMER_NORESTART && + !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base); - } WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); -- cgit v1.2.3 From d25408756accbd2171abaa0678f986adae139e6f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 3 Apr 2015 09:04:05 +0530 Subject: clockevents: Stop unused clockevent devices To avoid getting spurious interrupts on a tickless CPU, clockevent device can now be stopped by switching to ONESHOT_STOPPED state. The natural place for handling this transition is tick_program_event(). On 'expires == KTIME_MAX', we skip programming the event and so we need to fix such call sites as well, to always call tick_program_event() irrespective of the expires value. Once the clockevent device is required again, check if it was earlier put into ONESHOT_STOPPED state. If yes, switch its state to ONESHOT before programming its event. To make sure we haven't missed any corner case, add a WARN() for the case where we try to reprogram clockevent device while we aren't configured in ONESHOT_STOPPED state. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: Frederic Weisbecker Cc: Kevin Hilman Cc: Daniel Lezcano Cc: Preeti U Murthy Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5146b07be7f0bc497e0ebae036590ec2fa73e540.1428031396.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4adf32067862..278d4b36fd94 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -550,8 +550,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) if (cpu_base->hang_detected) return; - if (cpu_base->expires_next.tv64 != KTIME_MAX) - tick_program_event(cpu_base->expires_next, 1); + tick_program_event(cpu_base->expires_next, 1); } /* @@ -1237,8 +1236,7 @@ retry: raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ - if (expires_next.tv64 == KTIME_MAX || - !tick_program_event(expires_next, 0)) { + if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } -- cgit v1.2.3 From c04dca02bc73096435a5c36efd5ccb2171edcbe1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 11 Jun 2015 14:46:44 +0200 Subject: hrtimer: Remove HRTIMER_STATE_MIGRATE I do not understand HRTIMER_STATE_MIGRATE. Unless I am totally confused it looks buggy and simply unneeded. migrate_hrtimer_list() sets it to keep hrtimer_active() == T, but this is not enough: this can fool, say, hrtimer_is_queued() in dequeue_signal(). Can't migrate_hrtimer_list() simply use HRTIMER_STATE_ENQUEUED? This fixes the race and we can kill STATE_MIGRATE. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.072387650@infradead.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 278d4b36fd94..b1b795e5e0b1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1508,11 +1508,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, debug_deactivate(timer); /* - * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -1523,9 +1523,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * event device. */ enqueue_hrtimer(timer, new_base); - - /* Clear the migration state bit */ - timer->state &= ~HRTIMER_STATE_MIGRATE; } } -- cgit v1.2.3 From 8edfb0362e8e52dec2de08fa163af01c9da2c9d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:45 +0200 Subject: hrtimer: Fix hrtimer_is_queued() hole A queued hrtimer that gets restarted (hrtimer_start*() while hrtimer_is_queued()) will briefly appear as unqueued/inactive, even though the timer has always been active, we just moved it. Close this hole by preserving timer->state in hrtimer_start_range_ns()'s remove_hrtimer() call. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.175989138@infradead.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b1b795e5e0b1..1604157374d7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -889,10 +889,10 @@ static void __remove_hrtimer(struct hrtimer *timer, * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state; + unsigned long state = timer->state; int reprogram; /* @@ -906,12 +906,15 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. - */ - state = timer->state & HRTIMER_STATE_CALLBACK; + + if (!restart) { + /* + * We must preserve the CALLBACK state flag here, + * otherwise we could move the timer base in + * switch_hrtimer_base. + */ + state &= HRTIMER_STATE_CALLBACK; + } __remove_hrtimer(timer, base, state, reprogram); return 1; } @@ -936,7 +939,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, base = lock_hrtimer_base(timer, &flags); /* Remove an active timer from the queue: */ - remove_hrtimer(timer, base); + remove_hrtimer(timer, base, true); if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, base->get_time()); @@ -1005,7 +1008,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base); + ret = remove_hrtimer(timer, base, false); unlock_hrtimer_base(timer, &flags); -- cgit v1.2.3 From 887d9dc989eb0154492e41e7c07492edbb088ba1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:48 +0200 Subject: hrtimer: Allow hrtimer::function() to free the timer Currently an hrtimer callback function cannot free its own timer because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK after it. Freeing the timer would result in a clear use-after-free. Solve this by using a scheme similar to regular timers; track the current running timer in hrtimer_clock_base::running. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: wanpeng.li@linux.intel.com Cc: Al Viro Cc: Linus Torvalds Cc: Paul McKenney Cc: Oleg Nesterov Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.471563047@infradead.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 114 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 23 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1604157374d7..f026413de4d6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -67,6 +67,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { @@ -110,6 +111,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) */ #ifdef CONFIG_SMP +/* + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() + * such that hrtimer_callback_running() can unconditionally dereference + * timer->base->cpu_base + */ +static struct hrtimer_cpu_base migration_cpu_base = { + .seq = SEQCNT_ZERO(migration_cpu_base), + .clock_base = { { .cpu_base = &migration_cpu_base, }, }, +}; + +#define migration_base migration_cpu_base.clock_base[0] + /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * possible to set timer->base = &migration_base and drop the lock: the timer + * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, @@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, for (;;) { base = timer->base; - if (likely(base != NULL)) { + if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; @@ -194,8 +207,8 @@ again: if (unlikely(hrtimer_callback_running(timer))) return base; - /* See the comment in lock_timer_base() */ - timer->base = NULL; + /* See the comment in lock_hrtimer_base() */ + timer->base = &migration_base; raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); @@ -838,11 +851,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - /* - * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the - * state of a possibly running callback. - */ - timer->state |= HRTIMER_STATE_ENQUEUED; + timer->state = HRTIMER_STATE_ENQUEUED; return timerqueue_add(&base->active, &timer->node); } @@ -907,14 +916,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - if (!restart) { - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. - */ - state &= HRTIMER_STATE_CALLBACK; - } + if (!restart) + state = HRTIMER_STATE_INACTIVE; + __remove_hrtimer(timer, base, state, reprogram); return 1; } @@ -1115,6 +1119,51 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); +/* + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. + * + * It is important for this function to not return a false negative. + */ +bool hrtimer_active(const struct hrtimer *timer) +{ + struct hrtimer_cpu_base *cpu_base; + unsigned int seq; + + do { + cpu_base = READ_ONCE(timer->base->cpu_base); + seq = raw_read_seqcount_begin(&cpu_base->seq); + + if (timer->state != HRTIMER_STATE_INACTIVE || + cpu_base->running == timer) + return true; + + } while (read_seqcount_retry(&cpu_base->seq, seq) || + cpu_base != READ_ONCE(timer->base->cpu_base)); + + return false; +} +EXPORT_SYMBOL_GPL(hrtimer_active); + +/* + * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 + * distinct sections: + * + * - queued: the timer is queued + * - callback: the timer is being ran + * - post: the timer is inactive or (re)queued + * + * On the read side we ensure we observe timer->state and cpu_base->running + * from the same section, if anything changed while we looked at it, we retry. + * This includes timer->base changing because sequence numbers alone are + * insufficient for that. + * + * The sequence numbers are required because otherwise we could still observe + * a false negative if the read side got smeared over multiple consequtive + * __run_hrtimer() invocations. + */ + static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now) @@ -1122,10 +1171,21 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, enum hrtimer_restart (*fn)(struct hrtimer *); int restart; - WARN_ON(!irqs_disabled()); + lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + cpu_base->running = timer; + + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&cpu_base->seq); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1141,7 +1201,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * Note: We clear the running state after enqueue_hrtimer and * we do not reprogramm the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * @@ -1153,9 +1213,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base); - WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&cpu_base->seq); - timer->state &= ~HRTIMER_STATE_CALLBACK; + WARN_ON_ONCE(cpu_base->running != timer); + cpu_base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) -- cgit v1.2.3 From bc7a34b8b9ebfb0f4b8a35a72a0b134fd6c5ef50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:33 +0000 Subject: timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f026413de4d6..6115f4df119b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -177,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&hrtimer_bases); + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +} +#else +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + return this_cpu_ptr(&hrtimer_bases); +} +#endif + /* * Switch the timer base to the current CPU when possible. */ @@ -184,14 +202,13 @@ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { + struct hrtimer_cpu_base *new_cpu_base, *this_base; struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = get_nohz_timer_target(pinned); int basenum = base->index; + this_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_base, pinned); again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { @@ -212,17 +229,19 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_base; goto again; } } -- cgit v1.2.3 From 683be13a284720205228e29207ef11a1c3c322b9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:35 +0000 Subject: timer: Minimize nohz off overhead If nohz is disabled on the kernel command line the [hr]timer code still calls wake_up_nohz_cpu() and tick_nohz_full_cpu(), a pretty pointless exercise. Cache nohz_active in [hr]timer per cpu bases and avoid the overhead. Before: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu After: 48.73% hog [.] main 15.36% [kernel] [k] _raw_spin_lock_irqsave 9.77% [kernel] [k] _raw_spin_unlock_irqrestore 6.61% [kernel] [k] lock_timer_base.isra.38 6.42% [kernel] [k] mod_timer 3.90% [kernel] [k] detach_if_pending 3.76% [kernel] [k] del_timer 2.41% [kernel] [k] internal_add_timer 1.39% [kernel] [k] __internal_add_timer 0.76% [kernel] [k] timerfn We probably should have a cached value for nohz full in the per cpu bases as well to avoid the cpumask check. The base cache line is hot already, the cpumask not necessarily. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Eric Dumazet Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224512.207378134@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/time/hrtimer.c') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 6115f4df119b..db5c9508ed95 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -994,7 +994,8 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * Kick to reschedule the next tick to handle the new timer * on dynticks target. */ - wake_up_nohz_cpu(new_base->cpu_base->cpu); + if (new_base->cpu_base->nohz_active) + wake_up_nohz_cpu(new_base->cpu_base->cpu); } else { hrtimer_reprogram(timer, new_base); } -- cgit v1.2.3