diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/time/alarmtimer.c | 72 | ||||
| -rw-r--r-- | kernel/time/clocksource.c | 9 | ||||
| -rw-r--r-- | kernel/time/hrtimer.c | 152 | ||||
| -rw-r--r-- | kernel/time/jiffies.c | 11 | ||||
| -rw-r--r-- | kernel/time/namespace.c | 2 | ||||
| -rw-r--r-- | kernel/time/posix-cpu-timers.c | 19 | ||||
| -rw-r--r-- | kernel/time/posix-timers.c | 35 | ||||
| -rw-r--r-- | kernel/time/posix-timers.h | 4 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 3 | ||||
| -rw-r--r-- | kernel/time/timer.c | 2 | ||||
| -rw-r--r-- | kernel/time/timer_migration.c | 241 | ||||
| -rw-r--r-- | kernel/time/timer_migration.h | 36 |
12 files changed, 419 insertions, 167 deletions
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 6e173d70d825..ea5be5870e51 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -337,48 +337,32 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, EXPORT_SYMBOL_GPL(alarm_init); /** - * alarm_start - Sets an absolute alarm to fire - * @alarm: ptr to alarm to set - * @start: time to run the alarm + * alarm_start_timer - Sets an alarm to fire + * @alarm: Pointer to alarm to set + * @expires: Expiry time + * @relative: True if @expires is relative + * + * Returns: True if the alarm was queued. False if it already expired */ -void alarm_start(struct alarm *alarm, ktime_t start) +bool alarm_start_timer(struct alarm *alarm, ktime_t expires, bool relative) { struct alarm_base *base = &alarm_bases[alarm->type]; - scoped_guard(spinlock_irqsave, &base->lock) { - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); - } + if (relative) + expires = ktime_add_safe(expires, base->get_ktime()); trace_alarmtimer_start(alarm, base->get_ktime()); -} -EXPORT_SYMBOL_GPL(alarm_start); - -/** - * alarm_start_relative - Sets a relative alarm to fire - * @alarm: ptr to alarm to set - * @start: time relative to now to run the alarm - */ -void alarm_start_relative(struct alarm *alarm, ktime_t start) -{ - struct alarm_base *base = &alarm_bases[alarm->type]; - - start = ktime_add_safe(start, base->get_ktime()); - alarm_start(alarm, start); -} -EXPORT_SYMBOL_GPL(alarm_start_relative); - -void alarm_restart(struct alarm *alarm) -{ - struct alarm_base *base = &alarm_bases[alarm->type]; guard(spinlock_irqsave)(&base->lock); - hrtimer_set_expires(&alarm->timer, alarm->node.expires); - hrtimer_restart(&alarm->timer); + alarm->node.expires = expires; alarmtimer_enqueue(base, alarm); + if (!hrtimer_start_range_ns_user(&alarm->timer, expires, 0, HRTIMER_MODE_ABS)) { + alarmtimer_dequeue(base, alarm); + return false; + } + return true; } -EXPORT_SYMBOL_GPL(alarm_restart); +EXPORT_SYMBOL_GPL(alarm_start_timer); /** * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -512,8 +496,6 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * @now: time at the timer expiration * * Posix timer callback for expired alarm timers. - * - * Return: whether the timer is to be restarted */ static void alarm_handle_timer(struct alarm *alarm, ktime_t now) { @@ -527,12 +509,12 @@ static void alarm_handle_timer(struct alarm *alarm, ktime_t now) * alarm_timer_rearm - Posix timer callback for rearming timer * @timr: Pointer to the posixtimer data struct */ -static void alarm_timer_rearm(struct k_itimer *timr) +static bool alarm_timer_rearm(struct k_itimer *timr) { struct alarm *alarm = &timr->it.alarm.alarmtimer; timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); - alarm_start(alarm, alarm->node.expires); + return alarm_start_timer(alarm, alarm->node.expires, false); } /** @@ -588,7 +570,7 @@ static void alarm_timer_wait_running(struct k_itimer *timr) * @absolute: Expiry value is absolute time * @sigev_none: Posix timer does not deliver signals */ -static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, +static bool alarm_timer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct alarm *alarm = &timr->it.alarm.alarmtimer; @@ -596,10 +578,16 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, if (!absolute) expires = ktime_add_safe(expires, base->get_ktime()); - if (sigev_none) + + /* + * sigev_none needs to update the expires value and pretend + * that the timer is queued + */ + if (sigev_none) { alarm->node.expires = expires; - else - alarm_start(&timr->it.alarm.alarmtimer, expires); + return true; + } + return alarm_start_timer(&timr->it.alarm.alarmtimer, expires, false); } /** @@ -706,7 +694,9 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); - alarm_start(alarm, absexp); + if (!alarm_start_timer(alarm, absexp, false)) + alarm->data = NULL; + if (likely(alarm->data)) schedule(); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 313f6c88148e..e48c4d379a7c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1222,14 +1222,8 @@ static void clocksource_enqueue(struct clocksource *cs) * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale - * - * This should only be called from the clocksource->enable() method. - * - * This *SHOULD NOT* be called directly! Please use the - * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper - * functions. */ -void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) +static void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; @@ -1287,7 +1281,6 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } -EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); /** * __clocksource_register_scale - Used to install new clocksources diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5bd6efe598f0..638ce623c342 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1352,8 +1352,14 @@ static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool return hrtimer_prefer_local(is_local, is_first, is_pinned); } -static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, - const enum hrtimer_mode mode, struct hrtimer_clock_base *base) +enum { + HRTIMER_REPROGRAM_NONE, + HRTIMER_REPROGRAM, + HRTIMER_REPROGRAM_FORCE, +}; + +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, + const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); bool is_pinned, first, was_first, keep_base = false; @@ -1410,7 +1416,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del /* If a deferred rearm is pending skip reprogramming the device */ if (cpu_base->deferred_rearm) { cpu_base->deferred_needs_update = true; - return false; + return HRTIMER_REPROGRAM_NONE; } if (!was_first || cpu_base != this_cpu_base) { @@ -1423,7 +1429,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del * callbacks. */ if (likely(hrtimer_base_is_online(this_cpu_base))) - return first; + return first ? HRTIMER_REPROGRAM : HRTIMER_REPROGRAM_NONE; /* * Timer was enqueued remote because the current base is @@ -1432,7 +1438,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del */ if (first) smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd); - return false; + return HRTIMER_REPROGRAM_NONE; } /* @@ -1446,7 +1452,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del */ if (timer->is_lazy) { if (cpu_base->expires_next <= hrtimer_get_expires(timer)) - return false; + return HRTIMER_REPROGRAM_NONE; } /* @@ -1455,8 +1461,24 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del * reprogram the hardware by evaluating the new first expiring * timer. */ - hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); - return false; + return HRTIMER_REPROGRAM_FORCE; +} + +static int hrtimer_start_range_ns_common(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) +{ + /* + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft + * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard + * expiry mode because unmarked timers are moved to softirq expiry. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + else + WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); + + return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base); } /** @@ -1476,24 +1498,104 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, debug_hrtimer_assert_init(timer); + base = lock_hrtimer_base(timer, &flags); + + switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) { + case HRTIMER_REPROGRAM: + hrtimer_reprogram(timer, true); + break; + case HRTIMER_REPROGRAM_FORCE: + hrtimer_force_reprogram(timer->base->cpu_base, 1); + break; + case HRTIMER_REPROGRAM_NONE: + break; + } + + unlock_hrtimer_base(timer, &flags); +} +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +static inline bool hrtimer_check_user_timer(struct hrtimer *timer) +{ + struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base; + ktime_t expires; + /* - * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft - * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard - * expiry mode because unmarked timers are moved to softirq expiry. + * This uses soft expires because that's the user provided + * expiry time, while expires can be further in the past + * due to a slack value added to the user expiry time. */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); - else - WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); + expires = hrtimer_get_softexpires(timer); + + /* Convert to monotonic */ + expires = ktime_sub(expires, timer->base->offset); + + /* + * Check whether this timer will end up as the first expiring timer in + * the CPU base. If not, no further checks required as it's then + * guaranteed to expire in the future. + */ + if (expires >= cpu_base->expires_next) + return true; + + /* Validate that the expiry time is in the future. */ + if (expires > ktime_get()) + return true; + + debug_hrtimer_deactivate(timer); + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_INACTIVE, false); + trace_hrtimer_start_expired(timer); + return false; +} + +/** + * hrtimer_start_range_ns_user - (re)start an user controlled hrtimer + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); + * softirq based mode is considered for debug purpose only! + * + * Returns: True when the timer was queued, false if it was already expired + * + * This function cannot invoke the timer callback for expired timers as it might + * be called under a lock which the timer callback needs to acquire. So the + * caller has to handle that case. + */ +bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + bool ret = true; + + debug_hrtimer_assert_init(timer); base = lock_hrtimer_base(timer, &flags); - if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) - hrtimer_reprogram(timer, true); + switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) { + case HRTIMER_REPROGRAM: + ret = hrtimer_check_user_timer(timer); + if (ret) + hrtimer_reprogram(timer, true); + break; + case HRTIMER_REPROGRAM_FORCE: + ret = hrtimer_check_user_timer(timer); + /* + * The base must always be reevaluated, independent of the + * result above because the timer was the first pending timer. + */ + hrtimer_force_reprogram(timer->base->cpu_base, 1); + break; + case HRTIMER_REPROGRAM_NONE: + break; + } unlock_hrtimer_base(timer, &flags); + return ret; } -EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns_user); /** * hrtimer_try_to_cancel - try to deactivate a timer @@ -1681,10 +1783,10 @@ EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ -u64 hrtimer_get_next_event(void) +ktime_t hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - u64 expires = KTIME_MAX; + ktime_t expires = KTIME_MAX; guard(raw_spinlock_irqsave)(&cpu_base->lock); if (!hrtimer_hres_active(cpu_base)) @@ -1700,10 +1802,10 @@ u64 hrtimer_get_next_event(void) * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ -u64 hrtimer_next_event_without(const struct hrtimer *exclude) +ktime_t hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - u64 expires = KTIME_MAX; + ktime_t expires = KTIME_MAX; unsigned int active; guard(raw_spinlock_irqsave)(&cpu_base->lock); @@ -2213,7 +2315,11 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; - hrtimer_start_expires(&sl->timer, mode); + /* If already expired, clear the task pointer and set current state to running */ + if (!hrtimer_start_expires_user(&sl->timer, mode)) { + sl->task = NULL; + __set_current_state(TASK_RUNNING); + } } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 1c954f330dfe..d51428867a33 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -60,15 +60,14 @@ EXPORT_SYMBOL(get_jiffies_64); EXPORT_SYMBOL(jiffies); -static int __init init_jiffies_clocksource(void) -{ - return __clocksource_register(&clocksource_jiffies); -} - -core_initcall(init_jiffies_clocksource); +static bool cs_jiffies_registered __initdata; struct clocksource * __init __weak clocksource_default_clock(void) { + if (!cs_jiffies_registered) { + __clocksource_register(&clocksource_jiffies); + cs_jiffies_registered = true; + } return &clocksource_jiffies; } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 4bca3f78c8ea..5fa0af66cf3f 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -57,6 +57,7 @@ ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, return tim; } +EXPORT_SYMBOL_GPL(do_timens_ktime_to_host); static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { @@ -351,6 +352,7 @@ struct time_namespace init_time_ns = { .user_ns = &init_user_ns, .frozen_offsets = true, }; +EXPORT_SYMBOL_GPL(init_time_ns); void __init time_ns_init(void) { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0de2bb7cbec0..74775b94d11b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -19,7 +19,7 @@ #include "posix-timers.h" -static void posix_cpu_timer_rearm(struct k_itimer *timer); +static bool posix_cpu_timer_rearm(struct k_itimer *timer); void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { @@ -1011,24 +1011,27 @@ static void check_process_timers(struct task_struct *tsk, /* * This is called from the signal code (via posixtimer_rearm) * when the last timer signal was delivered and we have to reload the timer. + * + * Return true unconditionally so the core code assumes the timer to be + * armed. Otherwise it would requeue the signal. */ -static void posix_cpu_timer_rearm(struct k_itimer *timer) +static bool posix_cpu_timer_rearm(struct k_itimer *timer) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct task_struct *p; struct sighand_struct *sighand; + struct task_struct *p; unsigned long flags; u64 now; - rcu_read_lock(); + guard(rcu)(); p = cpu_timer_task_rcu(timer); if (!p) - goto out; + return true; /* Protect timer list r/w in arm_timer() */ sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) - goto out; + return true; /* * Fetch the current sample and update the timer's expiry time. @@ -1045,8 +1048,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) */ arm_timer(timer, p); unlock_task_sighand(p, &flags); -out: - rcu_read_unlock(); + return true; } /** @@ -1504,6 +1506,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, spin_lock_irq(&timer.it_lock); error = posix_cpu_timer_set(&timer, flags, &it, NULL); if (error) { + posix_cpu_timer_del(&timer); spin_unlock_irq(&timer.it_lock); return error; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 9331e1614124..436ba794cc0b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -288,16 +288,18 @@ static inline int timer_overrun_to_int(struct k_itimer *timr) return (int)timr->it_overrun_last; } -static void common_hrtimer_rearm(struct k_itimer *timr) +static bool common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); - hrtimer_restart(timer); + return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS); } static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr) { + bool queued; + guard(spinlock)(&timr->it_lock); /* @@ -311,12 +313,18 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) return true; - timr->kclock->timer_rearm(timr); - timr->it_status = POSIX_TIMER_ARMED; + /* timer_rearm() updates timr::it_overrun */ + queued = timr->kclock->timer_rearm(timr); + timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1LL; ++timr->it_signal_seq; info->si_overrun = timer_overrun_to_int(timr); + + if (queued) + timr->it_status = POSIX_TIMER_ARMED; + else + posix_timer_queue_signal(timr); return true; } @@ -795,7 +803,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return timer_overrun_to_int(scoped_timer); } -static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, +static bool common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct hrtimer *timer = &timr->it.real.timer; @@ -820,8 +828,11 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); - if (!sigev_none) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + /* For sigev_none pretend that the timer is queued */ + if (sigev_none) + return true; + + return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS); } static int common_hrtimer_try_to_cancel(struct k_itimer *timr) @@ -903,9 +914,13 @@ int common_timer_set(struct k_itimer *timr, int flags, expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; - kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); - if (!sigev_none) - timr->it_status = POSIX_TIMER_ARMED; + if (kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none)) { + if (!sigev_none) + timr->it_status = POSIX_TIMER_ARMED; + } else { + /* Timer was already expired, queue the signal */ + posix_timer_queue_signal(timr); + } return 0; } diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 7f259e845d24..4ea9611dd716 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -27,11 +27,11 @@ struct k_clock { int (*timer_del)(struct k_itimer *timr); void (*timer_get)(struct k_itimer *timr, struct itimerspec64 *cur_setting); - void (*timer_rearm)(struct k_itimer *timr); + bool (*timer_rearm)(struct k_itimer *timr); s64 (*timer_forward)(struct k_itimer *timr, ktime_t now); ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); int (*timer_try_to_cancel)(struct k_itimer *timr); - void (*timer_arm)(struct k_itimer *timr, ktime_t expires, + bool (*timer_arm)(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none); void (*timer_wait_running)(struct k_itimer *timr); }; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cbbb87a0c6e7..3026a301dff7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1407,8 +1407,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ - next_event = min_t(u64, next_event, - hrtimer_next_event_without(&ts->sched_timer)); + next_event = min(next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 04d928c21aba..655a8c6cd84d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1932,7 +1932,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) */ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { - u64 nextevt = hrtimer_get_next_event(); + u64 nextevt = ktime_to_ns(hrtimer_get_next_event()); /* * If high resolution timers are enabled diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 52c15affdbff..806c23cf71fc 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -102,7 +102,7 @@ * active CPU/group information atomic_try_cmpxchg() is used instead and only * the per CPU tmigr_cpu->lock is held. * - * During the setup of groups tmigr_level_list is required. It is protected by + * During the setup of groups, hier->level_list is required. It is protected by * @tmigr_mutex. * * When @timer_base->lock as well as tmigr related locks are required, the lock @@ -416,13 +416,12 @@ */ static DEFINE_MUTEX(tmigr_mutex); -static struct list_head *tmigr_level_list __read_mostly; + +static LIST_HEAD(tmigr_hierarchy_list); static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; -static struct tmigr_group *tmigr_root; - static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); /* @@ -1469,6 +1468,34 @@ static long tmigr_trigger_active(void *unused) return 0; } +static unsigned int tmigr_get_capacity(int cpu) +{ + /* + * nohz_full CPUs need to make sure there is always an available (online) + * and never idle migrator to handle all their global timers. That duty + * is served by the timekeeper which then never stops its tick. But the + * timekeeper must then belong to the same hierarchy as all the nohz_full + * CPUs. Simply turn off capacity awareness when nohz_full is running. + */ + if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN)) + return SCHED_CAPACITY_SCALE; + else + return arch_scale_cpu_capacity(cpu); +} + +static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu) +{ + unsigned int capacity = tmigr_get_capacity(cpu); + struct tmigr_hierarchy *iter; + + list_for_each_entry(iter, &tmigr_hierarchy_list, node) { + if (iter->capacity == capacity) + return iter; + } + + return NULL; +} + static int tmigr_clear_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1493,8 +1520,21 @@ static int tmigr_clear_cpu_available(unsigned int cpu) } if (firstexp != KTIME_MAX) { - migrator = cpumask_any(tmigr_available_cpumask); - work_on_cpu(migrator, tmigr_trigger_active, NULL); + struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu); + + if (WARN_ON_ONCE(!hier)) + return -EINVAL; + + migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask); + if (migrator < nr_cpu_ids) { + work_on_cpu(migrator, tmigr_trigger_active, NULL); + } else { + /* + * If deactivation returned an expiration, it belongs to an available + * nohz CPU in the hierarchy. + */ + WARN_ONCE(1, "Expected available CPU in the hierarchy\n"); + } } return 0; @@ -1657,14 +1697,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, group->groupevt.ignore = true; } -static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) +static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; lockdep_assert_held(&tmigr_mutex); /* Try to attach to an existing group first */ - list_for_each_entry(tmp, &tmigr_level_list[lvl], list) { + list_for_each_entry(tmp, &hier->level_list[lvl], list) { /* * If @lvl is below the cross NUMA node level, check whether * this group belongs to the same NUMA node. @@ -1698,14 +1738,14 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) tmigr_init_group(group, lvl, node); /* Setup successful. Add it to the hierarchy */ - list_add(&group->list, &tmigr_level_list[lvl]); + list_add(&group->list, &hier->level_list[lvl]); trace_tmigr_group_set(group); return group; } -static bool tmigr_init_root(struct tmigr_group *group, bool activate) +static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate) { - if (!group->parent && group != tmigr_root) { + if (!group->parent && group != hier->root) { /* * This is the new top-level, prepare its groupmask in advance * to avoid accidents where yet another new top-level is @@ -1721,11 +1761,10 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate) } -static void tmigr_connect_child_parent(struct tmigr_group *child, - struct tmigr_group *parent, - bool activate) +static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child, + struct tmigr_group *parent, bool activate) { - if (tmigr_init_root(parent, activate)) { + if (tmigr_init_root(hier, parent, activate)) { /* * The previous top level had prepared its groupmask already, * simply account it in advance as the first child. If some groups @@ -1758,13 +1797,13 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, */ smp_store_release(&child->parent, parent); - trace_tmigr_connect_child_parent(child); + trace_tmigr_connect_child_parent(hier, child); } -static int tmigr_setup_groups(unsigned int cpu, unsigned int node, - struct tmigr_group *start, bool activate) +static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu, + unsigned int node, struct tmigr_group *start, bool activate) { - struct tmigr_group *group, *child, **stack; + struct tmigr_group *root = hier->root, *group, *child, **stack; int i, top = 0, err = 0, start_lvl = 0; bool root_mismatch = false; @@ -1777,11 +1816,11 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, start_lvl = start->level + 1; } - if (tmigr_root) - root_mismatch = tmigr_root->numa_node != node; + if (root) + root_mismatch = root->numa_node != node; for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { - group = tmigr_get_group(node, i); + group = tmigr_get_group(hier, node, i); if (IS_ERR(group)) { err = PTR_ERR(group); i--; @@ -1803,7 +1842,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, if (group->parent) break; if ((!root_mismatch || i >= tmigr_crossnode_level) && - list_is_singular(&tmigr_level_list[i])) + list_is_singular(&hier->level_list[i])) break; } @@ -1831,15 +1870,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); - tmigr_init_root(group, activate); + tmigr_init_root(hier, group, activate); - trace_tmigr_connect_cpu_parent(tmc); + trace_tmigr_connect_cpu_parent(hier, tmc); /* There are no children that need to be connected */ continue; } else { child = stack[i - 1]; - tmigr_connect_child_parent(child, group, activate); + tmigr_connect_child_parent(hier, child, group, activate); } } @@ -1895,18 +1934,23 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, data.childmask = start->groupmask; __walk_groups_from(tmigr_active_up, &data, start, start->parent); } + } else if (start) { + union tmigr_state state; + + /* Remote activation assumes the whole target's hierarchy is inactive */ + state.state = atomic_read(&start->migr_state); + WARN_ON_ONCE(state.active); } /* Root update */ - if (list_is_singular(&tmigr_level_list[top])) { - group = list_first_entry(&tmigr_level_list[top], - typeof(*group), list); + if (list_is_singular(&hier->level_list[top])) { + group = list_first_entry(&hier->level_list[top], typeof(*group), list); WARN_ON_ONCE(group->parent); - if (tmigr_root) { + if (root) { /* Old root should be the same or below */ - WARN_ON_ONCE(tmigr_root->level > top); + WARN_ON_ONCE(root->level > top); } - tmigr_root = group; + hier->root = group; } out: kfree(stack); @@ -1914,34 +1958,123 @@ out: return err; } +static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu) +{ + struct tmigr_hierarchy *hier; + + hier = __tmigr_get_hierarchy(cpu); + + if (hier) + return hier; + + hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels); + if (!hier) + return ERR_PTR(-ENOMEM); + + hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!hier->cpumask) { + kfree(hier); + return ERR_PTR(-ENOMEM); + } + + for (int i = 0; i < tmigr_hierarchy_levels; i++) + INIT_LIST_HEAD(&hier->level_list[i]); + + hier->capacity = tmigr_get_capacity(cpu); + list_add_tail(&hier->node, &tmigr_hierarchy_list); + + return hier; +} + +static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu, + struct tmigr_group *old_root, bool activate) +{ + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == smp_processor_id()); + if (activate) { + /* + * The current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available)); + } + + return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate); +} + +static long connect_old_root_work(void *arg) +{ + struct tmigr_group *old_root = arg; + struct tmigr_hierarchy *hier; + int cpu = smp_processor_id(); + + hier = __tmigr_get_hierarchy(cpu); + if (WARN_ON_ONCE(!hier)) + return -EINVAL; + + return tmigr_connect_old_root(hier, cpu, old_root, true); +} + static int tmigr_add_cpu(unsigned int cpu) { - struct tmigr_group *old_root = tmigr_root; + struct tmigr_hierarchy *hier; + struct tmigr_group *old_root; int node = cpu_to_node(cpu); int ret; guard(mutex)(&tmigr_mutex); - ret = tmigr_setup_groups(cpu, node, NULL, false); + hier = tmigr_get_hierarchy(cpu); + if (IS_ERR(hier)) + return PTR_ERR(hier); + + old_root = hier->root; + + ret = tmigr_setup_groups(hier, cpu, node, NULL, false); + + if (ret < 0) + return ret; /* Root has changed? Connect the old one to the new */ - if (ret >= 0 && old_root && old_root != tmigr_root) { - /* - * The target CPU must never do the prepare work, except - * on early boot when the boot CPU is the target. Otherwise - * it may spuriously activate the old top level group inside - * the new one (nevertheless whether old top level group is - * active or not) and/or release an uninitialized childmask. - */ - WARN_ON_ONCE(cpu == raw_smp_processor_id()); - /* - * The (likely) current CPU is expected to be online in the hierarchy, - * otherwise the old root may not be active as expected. - */ - WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); - ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + if (old_root && old_root != hier->root) { + guard(migrate)(); + + if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) { + /* + * If the target belong to the same hierarchy, the old root is expected + * to be active. Link and propagate to the new root. + */ + ret = tmigr_connect_old_root(hier, cpu, old_root, true); + } else { + int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask); + + if (target < nr_cpu_ids) { + /* + * If the target doesn't belong to the same hierarchy as the current + * CPU, activate from a relevant one to make sure the old root is + * active. + */ + ret = work_on_cpu(target, connect_old_root_work, old_root); + } else { + /* + * No other available CPUs in the remote hierarchy. Link the + * old root remotely but don't propagate activation since the + * old root is not expected to be active. + */ + ret = tmigr_connect_old_root(hier, cpu, old_root, false); + } + } } + if (ret >= 0) + cpumask_set_cpu(cpu, hier->cpumask); + return ret; } @@ -1974,7 +2107,7 @@ static int tmigr_cpu_prepare(unsigned int cpu) static int __init tmigr_init(void) { - unsigned int cpulvl, nodelvl, cpus_per_node, i; + unsigned int cpulvl, nodelvl, cpus_per_node; unsigned int nnodes = num_possible_nodes(); unsigned int ncpus = num_possible_cpus(); int ret = -ENOMEM; @@ -2021,14 +2154,6 @@ static int __init tmigr_init(void) */ tmigr_crossnode_level = cpulvl; - tmigr_level_list = kzalloc_objs(struct list_head, - tmigr_hierarchy_levels); - if (!tmigr_level_list) - goto err; - - for (i = 0; i < tmigr_hierarchy_levels; i++) - INIT_LIST_HEAD(&tmigr_level_list[i]); - pr_info("Timer migration: %d hierarchy levels; %d children per group;" " %d crossnode level\n", tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 70879cde6fdd..31735dd52327 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -6,6 +6,24 @@ #define TMIGR_CHILDREN_PER_GROUP 8 /** + * struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity. + * Homogeneous systems have only one hierarchy. + * Heterogenous have one hierarchy per CPU capacity. + * @cpumask: CPUs belonging to this hierarchy + * @root: The current root of the hierarchy + * @capacity: CPU capacity associated to this hierarchy + * @node: Node in the global hierarchy list + * @level_list: Per level lists of tmigr groups + */ +struct tmigr_hierarchy { + struct cpumask *cpumask; + struct tmigr_group *root; + unsigned long capacity; + struct list_head node; + struct list_head level_list[]; +}; + +/** * struct tmigr_event - a timer event associated to a CPU * @nextevt: The node to enqueue an event in the parent group queue * @cpu: The CPU to which this event belongs @@ -75,15 +93,17 @@ struct tmigr_group { /** * struct tmigr_cpu - timer migration per CPU group * @lock: Lock protecting the tmigr_cpu group information - * @online: Indicates whether the CPU is online; In deactivate path - * it is required to know whether the migrator in the top - * level group is to be set offline, while a timer is - * pending. Then another online CPU needs to be notified to - * take over the migrator role. Furthermore the information - * is required in CPU hotplug path as the CPU is able to go - * idle before the timer migration hierarchy hotplug AP is - * reached. During this phase, the CPU has to handle the + * @available: Indicates whether the CPU is available for handling + * global timers. In the deactivate path it is required to + * know whether the migrator in the top level group is to + * be set offline, while a timer is pending. Then another + * available CPU needs to be notified to take over the + * migrator role. Furthermore the information is required + * in the CPU hotplug path as the CPU is able to go idle + * before the timer migration hierarchy hotplug callback is + * reached. During this phase, the CPU has to handle the * global timers on its own and must not act as a migrator. + * @idle: Indicates whether the CPU is idle in the timer migration * hierarchy * @remote: Is set when timers of the CPU are expired remotely |
