summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/time/alarmtimer.c72
-rw-r--r--kernel/time/clocksource.c9
-rw-r--r--kernel/time/hrtimer.c152
-rw-r--r--kernel/time/jiffies.c11
-rw-r--r--kernel/time/namespace.c2
-rw-r--r--kernel/time/posix-cpu-timers.c19
-rw-r--r--kernel/time/posix-timers.c35
-rw-r--r--kernel/time/posix-timers.h4
-rw-r--r--kernel/time/tick-sched.c3
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/time/timer_migration.c241
-rw-r--r--kernel/time/timer_migration.h36
12 files changed, 419 insertions, 167 deletions
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 6e173d70d825..ea5be5870e51 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -337,48 +337,32 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
EXPORT_SYMBOL_GPL(alarm_init);
/**
- * alarm_start - Sets an absolute alarm to fire
- * @alarm: ptr to alarm to set
- * @start: time to run the alarm
+ * alarm_start_timer - Sets an alarm to fire
+ * @alarm: Pointer to alarm to set
+ * @expires: Expiry time
+ * @relative: True if @expires is relative
+ *
+ * Returns: True if the alarm was queued. False if it already expired
*/
-void alarm_start(struct alarm *alarm, ktime_t start)
+bool alarm_start_timer(struct alarm *alarm, ktime_t expires, bool relative)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- scoped_guard(spinlock_irqsave, &base->lock) {
- alarm->node.expires = start;
- alarmtimer_enqueue(base, alarm);
- hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
- }
+ if (relative)
+ expires = ktime_add_safe(expires, base->get_ktime());
trace_alarmtimer_start(alarm, base->get_ktime());
-}
-EXPORT_SYMBOL_GPL(alarm_start);
-
-/**
- * alarm_start_relative - Sets a relative alarm to fire
- * @alarm: ptr to alarm to set
- * @start: time relative to now to run the alarm
- */
-void alarm_start_relative(struct alarm *alarm, ktime_t start)
-{
- struct alarm_base *base = &alarm_bases[alarm->type];
-
- start = ktime_add_safe(start, base->get_ktime());
- alarm_start(alarm, start);
-}
-EXPORT_SYMBOL_GPL(alarm_start_relative);
-
-void alarm_restart(struct alarm *alarm)
-{
- struct alarm_base *base = &alarm_bases[alarm->type];
guard(spinlock_irqsave)(&base->lock);
- hrtimer_set_expires(&alarm->timer, alarm->node.expires);
- hrtimer_restart(&alarm->timer);
+ alarm->node.expires = expires;
alarmtimer_enqueue(base, alarm);
+ if (!hrtimer_start_range_ns_user(&alarm->timer, expires, 0, HRTIMER_MODE_ABS)) {
+ alarmtimer_dequeue(base, alarm);
+ return false;
+ }
+ return true;
}
-EXPORT_SYMBOL_GPL(alarm_restart);
+EXPORT_SYMBOL_GPL(alarm_start_timer);
/**
* alarm_try_to_cancel - Tries to cancel an alarm timer
@@ -512,8 +496,6 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
* @now: time at the timer expiration
*
* Posix timer callback for expired alarm timers.
- *
- * Return: whether the timer is to be restarted
*/
static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
{
@@ -527,12 +509,12 @@ static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
* alarm_timer_rearm - Posix timer callback for rearming timer
* @timr: Pointer to the posixtimer data struct
*/
-static void alarm_timer_rearm(struct k_itimer *timr)
+static bool alarm_timer_rearm(struct k_itimer *timr)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
timr->it_overrun += alarm_forward_now(alarm, timr->it_interval);
- alarm_start(alarm, alarm->node.expires);
+ return alarm_start_timer(alarm, alarm->node.expires, false);
}
/**
@@ -588,7 +570,7 @@ static void alarm_timer_wait_running(struct k_itimer *timr)
* @absolute: Expiry value is absolute time
* @sigev_none: Posix timer does not deliver signals
*/
-static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
+static bool alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
bool absolute, bool sigev_none)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
@@ -596,10 +578,16 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
if (!absolute)
expires = ktime_add_safe(expires, base->get_ktime());
- if (sigev_none)
+
+ /*
+ * sigev_none needs to update the expires value and pretend
+ * that the timer is queued
+ */
+ if (sigev_none) {
alarm->node.expires = expires;
- else
- alarm_start(&timr->it.alarm.alarmtimer, expires);
+ return true;
+ }
+ return alarm_start_timer(&timr->it.alarm.alarmtimer, expires, false);
}
/**
@@ -706,7 +694,9 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
alarm->data = (void *)current;
do {
set_current_state(TASK_INTERRUPTIBLE);
- alarm_start(alarm, absexp);
+ if (!alarm_start_timer(alarm, absexp, false))
+ alarm->data = NULL;
+
if (likely(alarm->data))
schedule();
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 313f6c88148e..e48c4d379a7c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1222,14 +1222,8 @@ static void clocksource_enqueue(struct clocksource *cs)
* @cs: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
- *
- * This should only be called from the clocksource->enable() method.
- *
- * This *SHOULD NOT* be called directly! Please use the
- * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
- * functions.
*/
-void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
+static void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
u64 sec;
@@ -1287,7 +1281,6 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
-EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
/**
* __clocksource_register_scale - Used to install new clocksources
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5bd6efe598f0..638ce623c342 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1352,8 +1352,14 @@ static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool
return hrtimer_prefer_local(is_local, is_first, is_pinned);
}
-static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
- const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
+enum {
+ HRTIMER_REPROGRAM_NONE,
+ HRTIMER_REPROGRAM,
+ HRTIMER_REPROGRAM_FORCE,
+};
+
+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
{
struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
bool is_pinned, first, was_first, keep_base = false;
@@ -1410,7 +1416,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
/* If a deferred rearm is pending skip reprogramming the device */
if (cpu_base->deferred_rearm) {
cpu_base->deferred_needs_update = true;
- return false;
+ return HRTIMER_REPROGRAM_NONE;
}
if (!was_first || cpu_base != this_cpu_base) {
@@ -1423,7 +1429,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
* callbacks.
*/
if (likely(hrtimer_base_is_online(this_cpu_base)))
- return first;
+ return first ? HRTIMER_REPROGRAM : HRTIMER_REPROGRAM_NONE;
/*
* Timer was enqueued remote because the current base is
@@ -1432,7 +1438,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
*/
if (first)
smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
- return false;
+ return HRTIMER_REPROGRAM_NONE;
}
/*
@@ -1446,7 +1452,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
*/
if (timer->is_lazy) {
if (cpu_base->expires_next <= hrtimer_get_expires(timer))
- return false;
+ return HRTIMER_REPROGRAM_NONE;
}
/*
@@ -1455,8 +1461,24 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
* reprogram the hardware by evaluating the new first expiring
* timer.
*/
- hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
- return false;
+ return HRTIMER_REPROGRAM_FORCE;
+}
+
+static int hrtimer_start_range_ns_common(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode,
+ struct hrtimer_clock_base *base)
+{
+ /*
+ * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
+ * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
+ * expiry mode because unmarked timers are moved to softirq expiry.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+ else
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
+
+ return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base);
}
/**
@@ -1476,24 +1498,104 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
debug_hrtimer_assert_init(timer);
+ base = lock_hrtimer_base(timer, &flags);
+
+ switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
+ case HRTIMER_REPROGRAM:
+ hrtimer_reprogram(timer, true);
+ break;
+ case HRTIMER_REPROGRAM_FORCE:
+ hrtimer_force_reprogram(timer->base->cpu_base, 1);
+ break;
+ case HRTIMER_REPROGRAM_NONE:
+ break;
+ }
+
+ unlock_hrtimer_base(timer, &flags);
+}
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+static inline bool hrtimer_check_user_timer(struct hrtimer *timer)
+{
+ struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
+ ktime_t expires;
+
/*
- * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
- * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
- * expiry mode because unmarked timers are moved to softirq expiry.
+ * This uses soft expires because that's the user provided
+ * expiry time, while expires can be further in the past
+ * due to a slack value added to the user expiry time.
*/
- if (!IS_ENABLED(CONFIG_PREEMPT_RT))
- WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
- else
- WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
+ expires = hrtimer_get_softexpires(timer);
+
+ /* Convert to monotonic */
+ expires = ktime_sub(expires, timer->base->offset);
+
+ /*
+ * Check whether this timer will end up as the first expiring timer in
+ * the CPU base. If not, no further checks required as it's then
+ * guaranteed to expire in the future.
+ */
+ if (expires >= cpu_base->expires_next)
+ return true;
+
+ /* Validate that the expiry time is in the future. */
+ if (expires > ktime_get())
+ return true;
+
+ debug_hrtimer_deactivate(timer);
+ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_INACTIVE, false);
+ trace_hrtimer_start_expired(timer);
+ return false;
+}
+
+/**
+ * hrtimer_start_range_ns_user - (re)start an user controlled hrtimer
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
+ * softirq based mode is considered for debug purpose only!
+ *
+ * Returns: True when the timer was queued, false if it was already expired
+ *
+ * This function cannot invoke the timer callback for expired timers as it might
+ * be called under a lock which the timer callback needs to acquire. So the
+ * caller has to handle that case.
+ */
+bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode)
+{
+ struct hrtimer_clock_base *base;
+ unsigned long flags;
+ bool ret = true;
+
+ debug_hrtimer_assert_init(timer);
base = lock_hrtimer_base(timer, &flags);
- if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
- hrtimer_reprogram(timer, true);
+ switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
+ case HRTIMER_REPROGRAM:
+ ret = hrtimer_check_user_timer(timer);
+ if (ret)
+ hrtimer_reprogram(timer, true);
+ break;
+ case HRTIMER_REPROGRAM_FORCE:
+ ret = hrtimer_check_user_timer(timer);
+ /*
+ * The base must always be reevaluated, independent of the
+ * result above because the timer was the first pending timer.
+ */
+ hrtimer_force_reprogram(timer->base->cpu_base, 1);
+ break;
+ case HRTIMER_REPROGRAM_NONE:
+ break;
+ }
unlock_hrtimer_base(timer, &flags);
+ return ret;
}
-EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns_user);
/**
* hrtimer_try_to_cancel - try to deactivate a timer
@@ -1681,10 +1783,10 @@ EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
*
* Returns the next expiry time or KTIME_MAX if no timer is pending.
*/
-u64 hrtimer_get_next_event(void)
+ktime_t hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- u64 expires = KTIME_MAX;
+ ktime_t expires = KTIME_MAX;
guard(raw_spinlock_irqsave)(&cpu_base->lock);
if (!hrtimer_hres_active(cpu_base))
@@ -1700,10 +1802,10 @@ u64 hrtimer_get_next_event(void)
* Returns the next expiry time over all timers except for the @exclude one or
* KTIME_MAX if none of them is pending.
*/
-u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+ktime_t hrtimer_next_event_without(const struct hrtimer *exclude)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- u64 expires = KTIME_MAX;
+ ktime_t expires = KTIME_MAX;
unsigned int active;
guard(raw_spinlock_irqsave)(&cpu_base->lock);
@@ -2213,7 +2315,11 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode
if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
mode |= HRTIMER_MODE_HARD;
- hrtimer_start_expires(&sl->timer, mode);
+ /* If already expired, clear the task pointer and set current state to running */
+ if (!hrtimer_start_expires_user(&sl->timer, mode)) {
+ sl->task = NULL;
+ __set_current_state(TASK_RUNNING);
+ }
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1c954f330dfe..d51428867a33 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -60,15 +60,14 @@ EXPORT_SYMBOL(get_jiffies_64);
EXPORT_SYMBOL(jiffies);
-static int __init init_jiffies_clocksource(void)
-{
- return __clocksource_register(&clocksource_jiffies);
-}
-
-core_initcall(init_jiffies_clocksource);
+static bool cs_jiffies_registered __initdata;
struct clocksource * __init __weak clocksource_default_clock(void)
{
+ if (!cs_jiffies_registered) {
+ __clocksource_register(&clocksource_jiffies);
+ cs_jiffies_registered = true;
+ }
return &clocksource_jiffies;
}
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 4bca3f78c8ea..5fa0af66cf3f 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -57,6 +57,7 @@ ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
return tim;
}
+EXPORT_SYMBOL_GPL(do_timens_ktime_to_host);
static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
{
@@ -351,6 +352,7 @@ struct time_namespace init_time_ns = {
.user_ns = &init_user_ns,
.frozen_offsets = true,
};
+EXPORT_SYMBOL_GPL(init_time_ns);
void __init time_ns_init(void)
{
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0de2bb7cbec0..74775b94d11b 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -19,7 +19,7 @@
#include "posix-timers.h"
-static void posix_cpu_timer_rearm(struct k_itimer *timer);
+static bool posix_cpu_timer_rearm(struct k_itimer *timer);
void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
{
@@ -1011,24 +1011,27 @@ static void check_process_timers(struct task_struct *tsk,
/*
* This is called from the signal code (via posixtimer_rearm)
* when the last timer signal was delivered and we have to reload the timer.
+ *
+ * Return true unconditionally so the core code assumes the timer to be
+ * armed. Otherwise it would requeue the signal.
*/
-static void posix_cpu_timer_rearm(struct k_itimer *timer)
+static bool posix_cpu_timer_rearm(struct k_itimer *timer)
{
clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
- struct task_struct *p;
struct sighand_struct *sighand;
+ struct task_struct *p;
unsigned long flags;
u64 now;
- rcu_read_lock();
+ guard(rcu)();
p = cpu_timer_task_rcu(timer);
if (!p)
- goto out;
+ return true;
/* Protect timer list r/w in arm_timer() */
sighand = lock_task_sighand(p, &flags);
if (unlikely(sighand == NULL))
- goto out;
+ return true;
/*
* Fetch the current sample and update the timer's expiry time.
@@ -1045,8 +1048,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
*/
arm_timer(timer, p);
unlock_task_sighand(p, &flags);
-out:
- rcu_read_unlock();
+ return true;
}
/**
@@ -1504,6 +1506,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
spin_lock_irq(&timer.it_lock);
error = posix_cpu_timer_set(&timer, flags, &it, NULL);
if (error) {
+ posix_cpu_timer_del(&timer);
spin_unlock_irq(&timer.it_lock);
return error;
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 9331e1614124..436ba794cc0b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -288,16 +288,18 @@ static inline int timer_overrun_to_int(struct k_itimer *timr)
return (int)timr->it_overrun_last;
}
-static void common_hrtimer_rearm(struct k_itimer *timr)
+static bool common_hrtimer_rearm(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval);
- hrtimer_restart(timer);
+ return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS);
}
static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr)
{
+ bool queued;
+
guard(spinlock)(&timr->it_lock);
/*
@@ -311,12 +313,18 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it
if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
return true;
- timr->kclock->timer_rearm(timr);
- timr->it_status = POSIX_TIMER_ARMED;
+ /* timer_rearm() updates timr::it_overrun */
+ queued = timr->kclock->timer_rearm(timr);
+
timr->it_overrun_last = timr->it_overrun;
timr->it_overrun = -1LL;
++timr->it_signal_seq;
info->si_overrun = timer_overrun_to_int(timr);
+
+ if (queued)
+ timr->it_status = POSIX_TIMER_ARMED;
+ else
+ posix_timer_queue_signal(timr);
return true;
}
@@ -795,7 +803,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
return timer_overrun_to_int(scoped_timer);
}
-static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
+static bool common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
bool absolute, bool sigev_none)
{
struct hrtimer *timer = &timr->it.real.timer;
@@ -820,8 +828,11 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer));
hrtimer_set_expires(timer, expires);
- if (!sigev_none)
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ /* For sigev_none pretend that the timer is queued */
+ if (sigev_none)
+ return true;
+
+ return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS);
}
static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
@@ -903,9 +914,13 @@ int common_timer_set(struct k_itimer *timr, int flags,
expires = timens_ktime_to_host(timr->it_clock, expires);
sigev_none = timr->it_sigev_notify == SIGEV_NONE;
- kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
- if (!sigev_none)
- timr->it_status = POSIX_TIMER_ARMED;
+ if (kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none)) {
+ if (!sigev_none)
+ timr->it_status = POSIX_TIMER_ARMED;
+ } else {
+ /* Timer was already expired, queue the signal */
+ posix_timer_queue_signal(timr);
+ }
return 0;
}
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 7f259e845d24..4ea9611dd716 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -27,11 +27,11 @@ struct k_clock {
int (*timer_del)(struct k_itimer *timr);
void (*timer_get)(struct k_itimer *timr,
struct itimerspec64 *cur_setting);
- void (*timer_rearm)(struct k_itimer *timr);
+ bool (*timer_rearm)(struct k_itimer *timr);
s64 (*timer_forward)(struct k_itimer *timr, ktime_t now);
ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now);
int (*timer_try_to_cancel)(struct k_itimer *timr);
- void (*timer_arm)(struct k_itimer *timr, ktime_t expires,
+ bool (*timer_arm)(struct k_itimer *timr, ktime_t expires,
bool absolute, bool sigev_none);
void (*timer_wait_running)(struct k_itimer *timr);
};
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cbbb87a0c6e7..3026a301dff7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1407,8 +1407,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
* If the next highres timer to expire is earlier than 'next_event', the
* idle governor needs to know that.
*/
- next_event = min_t(u64, next_event,
- hrtimer_next_event_without(&ts->sched_timer));
+ next_event = min(next_event, hrtimer_next_event_without(&ts->sched_timer));
return ktime_sub(next_event, now);
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 04d928c21aba..655a8c6cd84d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1932,7 +1932,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
*/
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
- u64 nextevt = hrtimer_get_next_event();
+ u64 nextevt = ktime_to_ns(hrtimer_get_next_event());
/*
* If high resolution timers are enabled
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 52c15affdbff..806c23cf71fc 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -102,7 +102,7 @@
* active CPU/group information atomic_try_cmpxchg() is used instead and only
* the per CPU tmigr_cpu->lock is held.
*
- * During the setup of groups tmigr_level_list is required. It is protected by
+ * During the setup of groups, hier->level_list is required. It is protected by
* @tmigr_mutex.
*
* When @timer_base->lock as well as tmigr related locks are required, the lock
@@ -416,13 +416,12 @@
*/
static DEFINE_MUTEX(tmigr_mutex);
-static struct list_head *tmigr_level_list __read_mostly;
+
+static LIST_HEAD(tmigr_hierarchy_list);
static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;
-static struct tmigr_group *tmigr_root;
-
static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
/*
@@ -1469,6 +1468,34 @@ static long tmigr_trigger_active(void *unused)
return 0;
}
+static unsigned int tmigr_get_capacity(int cpu)
+{
+ /*
+ * nohz_full CPUs need to make sure there is always an available (online)
+ * and never idle migrator to handle all their global timers. That duty
+ * is served by the timekeeper which then never stops its tick. But the
+ * timekeeper must then belong to the same hierarchy as all the nohz_full
+ * CPUs. Simply turn off capacity awareness when nohz_full is running.
+ */
+ if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN))
+ return SCHED_CAPACITY_SCALE;
+ else
+ return arch_scale_cpu_capacity(cpu);
+}
+
+static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu)
+{
+ unsigned int capacity = tmigr_get_capacity(cpu);
+ struct tmigr_hierarchy *iter;
+
+ list_for_each_entry(iter, &tmigr_hierarchy_list, node) {
+ if (iter->capacity == capacity)
+ return iter;
+ }
+
+ return NULL;
+}
+
static int tmigr_clear_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1493,8 +1520,21 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
}
if (firstexp != KTIME_MAX) {
- migrator = cpumask_any(tmigr_available_cpumask);
- work_on_cpu(migrator, tmigr_trigger_active, NULL);
+ struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu);
+
+ if (WARN_ON_ONCE(!hier))
+ return -EINVAL;
+
+ migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask);
+ if (migrator < nr_cpu_ids) {
+ work_on_cpu(migrator, tmigr_trigger_active, NULL);
+ } else {
+ /*
+ * If deactivation returned an expiration, it belongs to an available
+ * nohz CPU in the hierarchy.
+ */
+ WARN_ONCE(1, "Expected available CPU in the hierarchy\n");
+ }
}
return 0;
@@ -1657,14 +1697,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->groupevt.ignore = true;
}
-static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl)
{
struct tmigr_group *tmp, *group = NULL;
lockdep_assert_held(&tmigr_mutex);
/* Try to attach to an existing group first */
- list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
+ list_for_each_entry(tmp, &hier->level_list[lvl], list) {
/*
* If @lvl is below the cross NUMA node level, check whether
* this group belongs to the same NUMA node.
@@ -1698,14 +1738,14 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
tmigr_init_group(group, lvl, node);
/* Setup successful. Add it to the hierarchy */
- list_add(&group->list, &tmigr_level_list[lvl]);
+ list_add(&group->list, &hier->level_list[lvl]);
trace_tmigr_group_set(group);
return group;
}
-static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate)
{
- if (!group->parent && group != tmigr_root) {
+ if (!group->parent && group != hier->root) {
/*
* This is the new top-level, prepare its groupmask in advance
* to avoid accidents where yet another new top-level is
@@ -1721,11 +1761,10 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate)
}
-static void tmigr_connect_child_parent(struct tmigr_group *child,
- struct tmigr_group *parent,
- bool activate)
+static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child,
+ struct tmigr_group *parent, bool activate)
{
- if (tmigr_init_root(parent, activate)) {
+ if (tmigr_init_root(hier, parent, activate)) {
/*
* The previous top level had prepared its groupmask already,
* simply account it in advance as the first child. If some groups
@@ -1758,13 +1797,13 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
*/
smp_store_release(&child->parent, parent);
- trace_tmigr_connect_child_parent(child);
+ trace_tmigr_connect_child_parent(hier, child);
}
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
- struct tmigr_group *start, bool activate)
+static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
+ unsigned int node, struct tmigr_group *start, bool activate)
{
- struct tmigr_group *group, *child, **stack;
+ struct tmigr_group *root = hier->root, *group, *child, **stack;
int i, top = 0, err = 0, start_lvl = 0;
bool root_mismatch = false;
@@ -1777,11 +1816,11 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
start_lvl = start->level + 1;
}
- if (tmigr_root)
- root_mismatch = tmigr_root->numa_node != node;
+ if (root)
+ root_mismatch = root->numa_node != node;
for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
- group = tmigr_get_group(node, i);
+ group = tmigr_get_group(hier, node, i);
if (IS_ERR(group)) {
err = PTR_ERR(group);
i--;
@@ -1803,7 +1842,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
if (group->parent)
break;
if ((!root_mismatch || i >= tmigr_crossnode_level) &&
- list_is_singular(&tmigr_level_list[i]))
+ list_is_singular(&hier->level_list[i]))
break;
}
@@ -1831,15 +1870,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
tmc->tmgroup = group;
tmc->groupmask = BIT(group->num_children++);
- tmigr_init_root(group, activate);
+ tmigr_init_root(hier, group, activate);
- trace_tmigr_connect_cpu_parent(tmc);
+ trace_tmigr_connect_cpu_parent(hier, tmc);
/* There are no children that need to be connected */
continue;
} else {
child = stack[i - 1];
- tmigr_connect_child_parent(child, group, activate);
+ tmigr_connect_child_parent(hier, child, group, activate);
}
}
@@ -1895,18 +1934,23 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
data.childmask = start->groupmask;
__walk_groups_from(tmigr_active_up, &data, start, start->parent);
}
+ } else if (start) {
+ union tmigr_state state;
+
+ /* Remote activation assumes the whole target's hierarchy is inactive */
+ state.state = atomic_read(&start->migr_state);
+ WARN_ON_ONCE(state.active);
}
/* Root update */
- if (list_is_singular(&tmigr_level_list[top])) {
- group = list_first_entry(&tmigr_level_list[top],
- typeof(*group), list);
+ if (list_is_singular(&hier->level_list[top])) {
+ group = list_first_entry(&hier->level_list[top], typeof(*group), list);
WARN_ON_ONCE(group->parent);
- if (tmigr_root) {
+ if (root) {
/* Old root should be the same or below */
- WARN_ON_ONCE(tmigr_root->level > top);
+ WARN_ON_ONCE(root->level > top);
}
- tmigr_root = group;
+ hier->root = group;
}
out:
kfree(stack);
@@ -1914,34 +1958,123 @@ out:
return err;
}
+static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu)
+{
+ struct tmigr_hierarchy *hier;
+
+ hier = __tmigr_get_hierarchy(cpu);
+
+ if (hier)
+ return hier;
+
+ hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels);
+ if (!hier)
+ return ERR_PTR(-ENOMEM);
+
+ hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!hier->cpumask) {
+ kfree(hier);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (int i = 0; i < tmigr_hierarchy_levels; i++)
+ INIT_LIST_HEAD(&hier->level_list[i]);
+
+ hier->capacity = tmigr_get_capacity(cpu);
+ list_add_tail(&hier->node, &tmigr_hierarchy_list);
+
+ return hier;
+}
+
+static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu,
+ struct tmigr_group *old_root, bool activate)
+{
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (nevertheless whether old top level group is
+ * active or not) and/or release an uninitialized childmask.
+ */
+ WARN_ON_ONCE(cpu == smp_processor_id());
+ if (activate) {
+ /*
+ * The current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
+ }
+
+ return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate);
+}
+
+static long connect_old_root_work(void *arg)
+{
+ struct tmigr_group *old_root = arg;
+ struct tmigr_hierarchy *hier;
+ int cpu = smp_processor_id();
+
+ hier = __tmigr_get_hierarchy(cpu);
+ if (WARN_ON_ONCE(!hier))
+ return -EINVAL;
+
+ return tmigr_connect_old_root(hier, cpu, old_root, true);
+}
+
static int tmigr_add_cpu(unsigned int cpu)
{
- struct tmigr_group *old_root = tmigr_root;
+ struct tmigr_hierarchy *hier;
+ struct tmigr_group *old_root;
int node = cpu_to_node(cpu);
int ret;
guard(mutex)(&tmigr_mutex);
- ret = tmigr_setup_groups(cpu, node, NULL, false);
+ hier = tmigr_get_hierarchy(cpu);
+ if (IS_ERR(hier))
+ return PTR_ERR(hier);
+
+ old_root = hier->root;
+
+ ret = tmigr_setup_groups(hier, cpu, node, NULL, false);
+
+ if (ret < 0)
+ return ret;
/* Root has changed? Connect the old one to the new */
- if (ret >= 0 && old_root && old_root != tmigr_root) {
- /*
- * The target CPU must never do the prepare work, except
- * on early boot when the boot CPU is the target. Otherwise
- * it may spuriously activate the old top level group inside
- * the new one (nevertheless whether old top level group is
- * active or not) and/or release an uninitialized childmask.
- */
- WARN_ON_ONCE(cpu == raw_smp_processor_id());
- /*
- * The (likely) current CPU is expected to be online in the hierarchy,
- * otherwise the old root may not be active as expected.
- */
- WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
- ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+ if (old_root && old_root != hier->root) {
+ guard(migrate)();
+
+ if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
+ /*
+ * If the target belong to the same hierarchy, the old root is expected
+ * to be active. Link and propagate to the new root.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, true);
+ } else {
+ int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);
+
+ if (target < nr_cpu_ids) {
+ /*
+ * If the target doesn't belong to the same hierarchy as the current
+ * CPU, activate from a relevant one to make sure the old root is
+ * active.
+ */
+ ret = work_on_cpu(target, connect_old_root_work, old_root);
+ } else {
+ /*
+ * No other available CPUs in the remote hierarchy. Link the
+ * old root remotely but don't propagate activation since the
+ * old root is not expected to be active.
+ */
+ ret = tmigr_connect_old_root(hier, cpu, old_root, false);
+ }
+ }
}
+ if (ret >= 0)
+ cpumask_set_cpu(cpu, hier->cpumask);
+
return ret;
}
@@ -1974,7 +2107,7 @@ static int tmigr_cpu_prepare(unsigned int cpu)
static int __init tmigr_init(void)
{
- unsigned int cpulvl, nodelvl, cpus_per_node, i;
+ unsigned int cpulvl, nodelvl, cpus_per_node;
unsigned int nnodes = num_possible_nodes();
unsigned int ncpus = num_possible_cpus();
int ret = -ENOMEM;
@@ -2021,14 +2154,6 @@ static int __init tmigr_init(void)
*/
tmigr_crossnode_level = cpulvl;
- tmigr_level_list = kzalloc_objs(struct list_head,
- tmigr_hierarchy_levels);
- if (!tmigr_level_list)
- goto err;
-
- for (i = 0; i < tmigr_hierarchy_levels; i++)
- INIT_LIST_HEAD(&tmigr_level_list[i]);
-
pr_info("Timer migration: %d hierarchy levels; %d children per group;"
" %d crossnode level\n",
tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 70879cde6fdd..31735dd52327 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -6,6 +6,24 @@
#define TMIGR_CHILDREN_PER_GROUP 8
/**
+ * struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity.
+ * Homogeneous systems have only one hierarchy.
+ * Heterogenous have one hierarchy per CPU capacity.
+ * @cpumask: CPUs belonging to this hierarchy
+ * @root: The current root of the hierarchy
+ * @capacity: CPU capacity associated to this hierarchy
+ * @node: Node in the global hierarchy list
+ * @level_list: Per level lists of tmigr groups
+ */
+struct tmigr_hierarchy {
+ struct cpumask *cpumask;
+ struct tmigr_group *root;
+ unsigned long capacity;
+ struct list_head node;
+ struct list_head level_list[];
+};
+
+/**
* struct tmigr_event - a timer event associated to a CPU
* @nextevt: The node to enqueue an event in the parent group queue
* @cpu: The CPU to which this event belongs
@@ -75,15 +93,17 @@ struct tmigr_group {
/**
* struct tmigr_cpu - timer migration per CPU group
* @lock: Lock protecting the tmigr_cpu group information
- * @online: Indicates whether the CPU is online; In deactivate path
- * it is required to know whether the migrator in the top
- * level group is to be set offline, while a timer is
- * pending. Then another online CPU needs to be notified to
- * take over the migrator role. Furthermore the information
- * is required in CPU hotplug path as the CPU is able to go
- * idle before the timer migration hierarchy hotplug AP is
- * reached. During this phase, the CPU has to handle the
+ * @available: Indicates whether the CPU is available for handling
+ * global timers. In the deactivate path it is required to
+ * know whether the migrator in the top level group is to
+ * be set offline, while a timer is pending. Then another
+ * available CPU needs to be notified to take over the
+ * migrator role. Furthermore the information is required
+ * in the CPU hotplug path as the CPU is able to go idle
+ * before the timer migration hierarchy hotplug callback is
+ * reached. During this phase, the CPU has to handle the
* global timers on its own and must not act as a migrator.
+
* @idle: Indicates whether the CPU is idle in the timer migration
* hierarchy
* @remote: Is set when timers of the CPU are expired remotely