12 files changed, 419 insertions, 167 deletions
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 6e173d70d825..ea5be5870e51 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -337,48 +337,32 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
 EXPORT_SYMBOL_GPL(alarm_init);
 
 /**
- * alarm_start - Sets an absolute alarm to fire
- * @alarm: ptr to alarm to set
- * @start: time to run the alarm
+ * alarm_start_timer - Sets an alarm to fire
+ * @alarm:	Pointer to alarm to set
+ * @expires:	Expiry time
+ * @relative:	True if @expires is relative
+ *
+ * Returns: True if the alarm was queued. False if it already expired
  */
-void alarm_start(struct alarm *alarm, ktime_t start)
+bool alarm_start_timer(struct alarm *alarm, ktime_t expires, bool relative)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 
-	scoped_guard(spinlock_irqsave, &base->lock) {
-		alarm->node.expires = start;
-		alarmtimer_enqueue(base, alarm);
-		hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
-	}
+	if (relative)
+		expires = ktime_add_safe(expires, base->get_ktime());
 
 	trace_alarmtimer_start(alarm, base->get_ktime());
-}
-EXPORT_SYMBOL_GPL(alarm_start);
-
-/**
- * alarm_start_relative - Sets a relative alarm to fire
- * @alarm: ptr to alarm to set
- * @start: time relative to now to run the alarm
- */
-void alarm_start_relative(struct alarm *alarm, ktime_t start)
-{
-	struct alarm_base *base = &alarm_bases[alarm->type];
-
-	start = ktime_add_safe(start, base->get_ktime());
-	alarm_start(alarm, start);
-}
-EXPORT_SYMBOL_GPL(alarm_start_relative);
-
-void alarm_restart(struct alarm *alarm)
-{
-	struct alarm_base *base = &alarm_bases[alarm->type];
 
 	guard(spinlock_irqsave)(&base->lock);
-	hrtimer_set_expires(&alarm->timer, alarm->node.expires);
-	hrtimer_restart(&alarm->timer);
+	alarm->node.expires = expires;
 	alarmtimer_enqueue(base, alarm);
+	if (!hrtimer_start_range_ns_user(&alarm->timer, expires, 0, HRTIMER_MODE_ABS)) {
+		alarmtimer_dequeue(base, alarm);
+		return false;
+	}
+	return true;
 }
-EXPORT_SYMBOL_GPL(alarm_restart);
+EXPORT_SYMBOL_GPL(alarm_start_timer);
 
 /**
  * alarm_try_to_cancel - Tries to cancel an alarm timer
@@ -512,8 +496,6 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
  * @now: time at the timer expiration
  *
  * Posix timer callback for expired alarm timers.
- *
- * Return: whether the timer is to be restarted
  */
 static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
 {
@@ -527,12 +509,12 @@ static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
  * alarm_timer_rearm - Posix timer callback for rearming timer
  * @timr:	Pointer to the posixtimer data struct
  */
-static void alarm_timer_rearm(struct k_itimer *timr)
+static bool alarm_timer_rearm(struct k_itimer *timr)
 {
 	struct alarm *alarm = &timr->it.alarm.alarmtimer;
 
 	timr->it_overrun += alarm_forward_now(alarm, timr->it_interval);
-	alarm_start(alarm, alarm->node.expires);
+	return alarm_start_timer(alarm, alarm->node.expires, false);
 }
 
 /**
@@ -588,7 +570,7 @@ static void alarm_timer_wait_running(struct k_itimer *timr)
  * @absolute:	Expiry value is absolute time
  * @sigev_none:	Posix timer does not deliver signals
  */
-static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
+static bool alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
 			    bool absolute, bool sigev_none)
 {
 	struct alarm *alarm = &timr->it.alarm.alarmtimer;
@@ -596,10 +578,16 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
 
 	if (!absolute)
 		expires = ktime_add_safe(expires, base->get_ktime());
-	if (sigev_none)
+
+	/*
+	 * sigev_none needs to update the expires value and pretend
+	 * that the timer is queued
+	 */
+	if (sigev_none) {
 		alarm->node.expires = expires;
-	else
-		alarm_start(&timr->it.alarm.alarmtimer, expires);
+		return true;
+	}
+	return alarm_start_timer(&timr->it.alarm.alarmtimer, expires, false);
 }
 
 /**
@@ -706,7 +694,9 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 	alarm->data = (void *)current;
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
-		alarm_start(alarm, absexp);
+		if (!alarm_start_timer(alarm, absexp, false))
+			alarm->data = NULL;
+
 		if (likely(alarm->data))
 			schedule();
 
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 313f6c88148e..e48c4d379a7c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1222,14 +1222,8 @@ static void clocksource_enqueue(struct clocksource *cs)
  * @cs:		clocksource to be registered
  * @scale:	Scale factor multiplied against freq to get clocksource hz
  * @freq:	clocksource frequency (cycles per second) divided by scale
- *
- * This should only be called from the clocksource->enable() method.
- *
- * This *SHOULD NOT* be called directly! Please use the
- * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
- * functions.
  */
-void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
+static void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
 	u64 sec;
 
@@ -1287,7 +1281,6 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
 	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
 		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
 }
-EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
 
 /**
  * __clocksource_register_scale - Used to install new clocksources
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5bd6efe598f0..638ce623c342 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1352,8 +1352,14 @@ static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool
 	return hrtimer_prefer_local(is_local, is_first, is_pinned);
 }
 
-static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
-				     const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
+enum {
+	HRTIMER_REPROGRAM_NONE,
+	HRTIMER_REPROGRAM,
+	HRTIMER_REPROGRAM_FORCE,
+};
+
+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+				    const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
 {
 	struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
 	bool is_pinned, first, was_first, keep_base = false;
@@ -1410,7 +1416,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 	/* If a deferred rearm is pending skip reprogramming the device */
 	if (cpu_base->deferred_rearm) {
 		cpu_base->deferred_needs_update = true;
-		return false;
+		return HRTIMER_REPROGRAM_NONE;
 	}
 
 	if (!was_first || cpu_base != this_cpu_base) {
@@ -1423,7 +1429,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 		 * callbacks.
 		 */
 		if (likely(hrtimer_base_is_online(this_cpu_base)))
-			return first;
+			return first ? HRTIMER_REPROGRAM : HRTIMER_REPROGRAM_NONE;
 
 		/*
 		 * Timer was enqueued remote because the current base is
@@ -1432,7 +1438,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 		 */
 		if (first)
 			smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
-		return false;
+		return HRTIMER_REPROGRAM_NONE;
 	}
 
 	/*
@@ -1446,7 +1452,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 	 */
 	if (timer->is_lazy) {
 		if (cpu_base->expires_next <= hrtimer_get_expires(timer))
-			return false;
+			return HRTIMER_REPROGRAM_NONE;
 	}
 
 	/*
@@ -1455,8 +1461,24 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 	 * reprogram the hardware by evaluating the new first expiring
 	 * timer.
 	 */
-	hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
-	return false;
+	return HRTIMER_REPROGRAM_FORCE;
+}
+
+static int hrtimer_start_range_ns_common(struct hrtimer *timer, ktime_t tim,
+					 u64 delta_ns, const enum hrtimer_mode mode,
+					 struct hrtimer_clock_base *base)
+{
+	/*
+	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
+	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
+	 * expiry mode because unmarked timers are moved to softirq expiry.
+	 */
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+	else
+		WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
+
+	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base);
 }
 
 /**
@@ -1476,24 +1498,104 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
 
 	debug_hrtimer_assert_init(timer);
 
+	base = lock_hrtimer_base(timer, &flags);
+
+	switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
+	case HRTIMER_REPROGRAM:
+		hrtimer_reprogram(timer, true);
+		break;
+	case HRTIMER_REPROGRAM_FORCE:
+		hrtimer_force_reprogram(timer->base->cpu_base, 1);
+		break;
+	case HRTIMER_REPROGRAM_NONE:
+		break;
+	}
+
+	unlock_hrtimer_base(timer, &flags);
+}
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+static inline bool hrtimer_check_user_timer(struct hrtimer *timer)
+{
+	struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
+	ktime_t expires;
+
 	/*
-	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
-	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
-	 * expiry mode because unmarked timers are moved to softirq expiry.
+	 * This uses soft expires because that's the user provided
+	 * expiry time, while expires can be further in the past
+	 * due to a slack value added to the user expiry time.
 	 */
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
-		WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
-	else
-		WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
+	expires = hrtimer_get_softexpires(timer);
+
+	/* Convert to monotonic */
+	expires = ktime_sub(expires, timer->base->offset);
+
+	/*
+	 * Check whether this timer will end up as the first expiring timer in
+	 * the CPU base. If not, no further checks required as it's then
+	 * guaranteed to expire in the future.
+	 */
+	if (expires >= cpu_base->expires_next)
+		return true;
+
+	/* Validate that the expiry time is in the future. */
+	if (expires > ktime_get())
+		return true;
+
+	debug_hrtimer_deactivate(timer);
+	__remove_hrtimer(timer, timer->base, HRTIMER_STATE_INACTIVE, false);
+	trace_hrtimer_start_expired(timer);
+	return false;
+}
+
+/**
+ * hrtimer_start_range_ns_user - (re)start an user controlled hrtimer
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
+ * @mode:	timer mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
+ *		softirq based mode is considered for debug purpose only!
+ *
+ * Returns: True when the timer was queued, false if it was already expired
+ *
+ * This function cannot invoke the timer callback for expired timers as it might
+ * be called under a lock which the timer callback needs to acquire. So the
+ * caller has to handle that case.
+ */
+bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim,
+				 u64 delta_ns, const enum hrtimer_mode mode)
+{
+	struct hrtimer_clock_base *base;
+	unsigned long flags;
+	bool ret = true;
+
+	debug_hrtimer_assert_init(timer);
 
 	base = lock_hrtimer_base(timer, &flags);
 
-	if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
-		hrtimer_reprogram(timer, true);
+	switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
+	case HRTIMER_REPROGRAM:
+		ret = hrtimer_check_user_timer(timer);
+		if (ret)
+			hrtimer_reprogram(timer, true);
+		break;
+	case HRTIMER_REPROGRAM_FORCE:
+		ret = hrtimer_check_user_timer(timer);
+		/*
+		 * The base must always be reevaluated, independent of the
+		 * result above because the timer was the first pending timer.
+		 */
+		hrtimer_force_reprogram(timer->base->cpu_base, 1);
+		break;
+	case HRTIMER_REPROGRAM_NONE:
+		break;
+	}
 
 	unlock_hrtimer_base(timer, &flags);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns_user);
 
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
@@ -1681,10 +1783,10 @@ EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
  *
  * Returns the next expiry time or KTIME_MAX if no timer is pending.
  */
-u64 hrtimer_get_next_event(void)
+ktime_t hrtimer_get_next_event(void)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	u64 expires = KTIME_MAX;
+	ktime_t expires = KTIME_MAX;
 
 	guard(raw_spinlock_irqsave)(&cpu_base->lock);
 	if (!hrtimer_hres_active(cpu_base))
@@ -1700,10 +1802,10 @@ u64 hrtimer_get_next_event(void)
  * Returns the next expiry time over all timers except for the @exclude one or
  * KTIME_MAX if none of them is pending.
  */
-u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+ktime_t hrtimer_next_event_without(const struct hrtimer *exclude)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	u64 expires = KTIME_MAX;
+	ktime_t expires = KTIME_MAX;
 	unsigned int active;
 
 	guard(raw_spinlock_irqsave)(&cpu_base->lock);
@@ -2213,7 +2315,11 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode
 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
 		mode |= HRTIMER_MODE_HARD;
 
-	hrtimer_start_expires(&sl->timer, mode);
+	/* If already expired, clear the task pointer and set current state to running */
+	if (!hrtimer_start_expires_user(&sl->timer, mode)) {
+		sl->task = NULL;
+		__set_current_state(TASK_RUNNING);
+	}
 }
 EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
 
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1c954f330dfe..d51428867a33 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -60,15 +60,14 @@ EXPORT_SYMBOL(get_jiffies_64);
 
 EXPORT_SYMBOL(jiffies);
 
-static int __init init_jiffies_clocksource(void)
-{
-	return __clocksource_register(&clocksource_jiffies);
-}
-
-core_initcall(init_jiffies_clocksource);
+static bool cs_jiffies_registered __initdata;
 
 struct clocksource * __init __weak clocksource_default_clock(void)
 {
+	if (!cs_jiffies_registered) {
+		__clocksource_register(&clocksource_jiffies);
+		cs_jiffies_registered = true;
+	}
 	return &clocksource_jiffies;
 }
 
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 4bca3f78c8ea..5fa0af66cf3f 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -57,6 +57,7 @@ ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
 
 	return tim;
 }
+EXPORT_SYMBOL_GPL(do_timens_ktime_to_host);
 
 static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
 {
@@ -351,6 +352,7 @@ struct time_namespace init_time_ns = {
 	.user_ns	= &init_user_ns,
 	.frozen_offsets	= true,
 };
+EXPORT_SYMBOL_GPL(init_time_ns);
 
 void __init time_ns_init(void)
 {
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0de2bb7cbec0..74775b94d11b 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -19,7 +19,7 @@
 
 #include "posix-timers.h"
 
-static void posix_cpu_timer_rearm(struct k_itimer *timer);
+static bool posix_cpu_timer_rearm(struct k_itimer *timer);
 
 void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
 {
@@ -1011,24 +1011,27 @@ static void check_process_timers(struct task_struct *tsk,
 /*
  * This is called from the signal code (via posixtimer_rearm)
  * when the last timer signal was delivered and we have to reload the timer.
+ *
+ * Return true unconditionally so the core code assumes the timer to be
+ * armed. Otherwise it would requeue the signal.
  */
-static void posix_cpu_timer_rearm(struct k_itimer *timer)
+static bool posix_cpu_timer_rearm(struct k_itimer *timer)
 {
 	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
-	struct task_struct *p;
 	struct sighand_struct *sighand;
+	struct task_struct *p;
 	unsigned long flags;
 	u64 now;
 
-	rcu_read_lock();
+	guard(rcu)();
 	p = cpu_timer_task_rcu(timer);
 	if (!p)
-		goto out;
+		return true;
 
 	/* Protect timer list r/w in arm_timer() */
 	sighand = lock_task_sighand(p, &flags);
 	if (unlikely(sighand == NULL))
-		goto out;
+		return true;
 
 	/*
 	 * Fetch the current sample and update the timer's expiry time.
@@ -1045,8 +1048,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
 	 */
 	arm_timer(timer, p);
 	unlock_task_sighand(p, &flags);
-out:
-	rcu_read_unlock();
+	return true;
 }
 
 /**
@@ -1504,6 +1506,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		spin_lock_irq(&timer.it_lock);
 		error = posix_cpu_timer_set(&timer, flags, &it, NULL);
 		if (error) {
+			posix_cpu_timer_del(&timer);
 			spin_unlock_irq(&timer.it_lock);
 			return error;
 		}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 9331e1614124..436ba794cc0b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -288,16 +288,18 @@ static inline int timer_overrun_to_int(struct k_itimer *timr)
 	return (int)timr->it_overrun_last;
 }
 
-static void common_hrtimer_rearm(struct k_itimer *timr)
+static bool common_hrtimer_rearm(struct k_itimer *timr)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
 
 	timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval);
-	hrtimer_restart(timer);
+	return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS);
 }
 
 static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr)
 {
+	bool queued;
+
 	guard(spinlock)(&timr->it_lock);
 
 	/*
@@ -311,12 +313,18 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it
 	if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
 		return true;
 
-	timr->kclock->timer_rearm(timr);
-	timr->it_status = POSIX_TIMER_ARMED;
+	/* timer_rearm() updates timr::it_overrun */
+	queued = timr->kclock->timer_rearm(timr);
+
 	timr->it_overrun_last = timr->it_overrun;
 	timr->it_overrun = -1LL;
 	++timr->it_signal_seq;
 	info->si_overrun = timer_overrun_to_int(timr);
+
+	if (queued)
+		timr->it_status = POSIX_TIMER_ARMED;
+	else
+		posix_timer_queue_signal(timr);
 	return true;
 }
 
@@ -795,7 +803,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
 		return timer_overrun_to_int(scoped_timer);
 }
 
-static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
+static bool common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
 			       bool absolute, bool sigev_none)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
@@ -820,8 +828,11 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
 		expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer));
 	hrtimer_set_expires(timer, expires);
 
-	if (!sigev_none)
-		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+	/* For sigev_none pretend that the timer is queued */
+	if (sigev_none)
+		return true;
+
+	return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS);
 }
 
 static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
@@ -903,9 +914,13 @@ int common_timer_set(struct k_itimer *timr, int flags,
 		expires = timens_ktime_to_host(timr->it_clock, expires);
 	sigev_none = timr->it_sigev_notify == SIGEV_NONE;
 
-	kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
-	if (!sigev_none)
-		timr->it_status = POSIX_TIMER_ARMED;
+	if (kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none)) {
+		if (!sigev_none)
+			timr->it_status = POSIX_TIMER_ARMED;
+	} else {
+		/* Timer was already expired, queue the signal */
+		posix_timer_queue_signal(timr);
+	}
 	return 0;
 }
 
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 7f259e845d24..4ea9611dd716 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -27,11 +27,11 @@ struct k_clock {
 	int	(*timer_del)(struct k_itimer *timr);
 	void	(*timer_get)(struct k_itimer *timr,
 			     struct itimerspec64 *cur_setting);
-	void	(*timer_rearm)(struct k_itimer *timr);
+	bool	(*timer_rearm)(struct k_itimer *timr);
 	s64	(*timer_forward)(struct k_itimer *timr, ktime_t now);
 	ktime_t	(*timer_remaining)(struct k_itimer *timr, ktime_t now);
 	int	(*timer_try_to_cancel)(struct k_itimer *timr);
-	void	(*timer_arm)(struct k_itimer *timr, ktime_t expires,
+	bool	(*timer_arm)(struct k_itimer *timr, ktime_t expires,
 			     bool absolute, bool sigev_none);
 	void	(*timer_wait_running)(struct k_itimer *timr);
 };
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cbbb87a0c6e7..3026a301dff7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1407,8 +1407,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 	 * If the next highres timer to expire is earlier than 'next_event', the
 	 * idle governor needs to know that.
 	 */
-	next_event = min_t(u64, next_event,
-			   hrtimer_next_event_without(&ts->sched_timer));
+	next_event = min(next_event, hrtimer_next_event_without(&ts->sched_timer));
 
 	return ktime_sub(next_event, now);
 }
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 04d928c21aba..655a8c6cd84d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1932,7 +1932,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
  */
 static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 {
-	u64 nextevt = hrtimer_get_next_event();
+	u64 nextevt = ktime_to_ns(hrtimer_get_next_event());
 
 	/*
 	 * If high resolution timers are enabled
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 52c15affdbff..806c23cf71fc 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -102,7 +102,7 @@
  * active CPU/group information atomic_try_cmpxchg() is used instead and only
  * the per CPU tmigr_cpu->lock is held.
  *
- * During the setup of groups tmigr_level_list is required. It is protected by
+ * During the setup of groups, hier->level_list is required. It is protected by
  * @tmigr_mutex.
  *
  * When @timer_base->lock as well as tmigr related locks are required, the lock
@@ -416,13 +416,12 @@
  */
 
 static DEFINE_MUTEX(tmigr_mutex);
-static struct list_head *tmigr_level_list __read_mostly;
+
+static LIST_HEAD(tmigr_hierarchy_list);
 
 static unsigned int tmigr_hierarchy_levels __read_mostly;
 static unsigned int tmigr_crossnode_level __read_mostly;
 
-static struct tmigr_group *tmigr_root;
-
 static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
 
 /*
@@ -1469,6 +1468,34 @@ static long tmigr_trigger_active(void *unused)
 	return 0;
 }
 
+static unsigned int tmigr_get_capacity(int cpu)
+{
+	/*
+	 * nohz_full CPUs need to make sure there is always an available (online)
+	 * and never idle migrator to handle all their global timers. That duty
+	 * is served by the timekeeper which then never stops its tick. But the
+	 * timekeeper must then belong to the same hierarchy as all the nohz_full
+	 * CPUs. Simply turn off capacity awareness when nohz_full is running.
+	 */
+	if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN))
+		return SCHED_CAPACITY_SCALE;
+	else
+		return arch_scale_cpu_capacity(cpu);
+}
+
+static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu)
+{
+	unsigned int capacity = tmigr_get_capacity(cpu);
+	struct tmigr_hierarchy *iter;
+
+	list_for_each_entry(iter, &tmigr_hierarchy_list, node) {
+		if (iter->capacity == capacity)
+			return iter;
+	}
+
+	return NULL;
+}
+
 static int tmigr_clear_cpu_available(unsigned int cpu)
 {
 	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1493,8 +1520,21 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
 	}
 
 	if (firstexp != KTIME_MAX) {
-		migrator = cpumask_any(tmigr_available_cpumask);
-		work_on_cpu(migrator, tmigr_trigger_active, NULL);
+		struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu);
+
+		if (WARN_ON_ONCE(!hier))
+			return -EINVAL;
+
+		migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask);
+		if (migrator < nr_cpu_ids) {
+			work_on_cpu(migrator, tmigr_trigger_active, NULL);
+		} else {
+			/*
+			 * If deactivation returned an expiration, it belongs to an available
+			 * nohz CPU in the hierarchy.
+			 */
+			WARN_ONCE(1, "Expected available CPU in the hierarchy\n");
+		}
 	}
 
 	return 0;
@@ -1657,14 +1697,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	group->groupevt.ignore = true;
 }
 
-static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl)
 {
 	struct tmigr_group *tmp, *group = NULL;
 
 	lockdep_assert_held(&tmigr_mutex);
 
 	/* Try to attach to an existing group first */
-	list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
+	list_for_each_entry(tmp, &hier->level_list[lvl], list) {
 		/*
 		 * If @lvl is below the cross NUMA node level, check whether
 		 * this group belongs to the same NUMA node.
@@ -1698,14 +1738,14 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
 	tmigr_init_group(group, lvl, node);
 
 	/* Setup successful. Add it to the hierarchy */
-	list_add(&group->list, &tmigr_level_list[lvl]);
+	list_add(&group->list, &hier->level_list[lvl]);
 	trace_tmigr_group_set(group);
 	return group;
 }
 
-static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate)
 {
-	if (!group->parent && group != tmigr_root) {
+	if (!group->parent && group != hier->root) {
 		/*
 		 * This is the new top-level, prepare its groupmask in advance
 		 * to avoid accidents where yet another new top-level is
@@ -1721,11 +1761,10 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate)
 
 }
 
-static void tmigr_connect_child_parent(struct tmigr_group *child,
-				       struct tmigr_group *parent,
-				       bool activate)
+static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child,
+				       struct tmigr_group *parent, bool activate)
 {
-	if (tmigr_init_root(parent, activate)) {
+	if (tmigr_init_root(hier, parent, activate)) {
 		/*
 		 * The previous top level had prepared its groupmask already,
 		 * simply account it in advance as the first child. If some groups
@@ -1758,13 +1797,13 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	 */
 	smp_store_release(&child->parent, parent);
 
-	trace_tmigr_connect_child_parent(child);
+	trace_tmigr_connect_child_parent(hier, child);
 }
 
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
-			      struct tmigr_group *start, bool activate)
+static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
+			      unsigned int node, struct tmigr_group *start, bool activate)
 {
-	struct tmigr_group *group, *child, **stack;
+	struct tmigr_group *root = hier->root, *group, *child, **stack;
 	int i, top = 0, err = 0, start_lvl = 0;
 	bool root_mismatch = false;
 
@@ -1777,11 +1816,11 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 		start_lvl = start->level + 1;
 	}
 
-	if (tmigr_root)
-		root_mismatch = tmigr_root->numa_node != node;
+	if (root)
+		root_mismatch = root->numa_node != node;
 
 	for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
-		group = tmigr_get_group(node, i);
+		group = tmigr_get_group(hier, node, i);
 		if (IS_ERR(group)) {
 			err = PTR_ERR(group);
 			i--;
@@ -1803,7 +1842,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 		if (group->parent)
 			break;
 		if ((!root_mismatch || i >= tmigr_crossnode_level) &&
-		    list_is_singular(&tmigr_level_list[i]))
+		    list_is_singular(&hier->level_list[i]))
 			break;
 	}
 
@@ -1831,15 +1870,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 			tmc->tmgroup = group;
 			tmc->groupmask = BIT(group->num_children++);
 
-			tmigr_init_root(group, activate);
+			tmigr_init_root(hier, group, activate);
 
-			trace_tmigr_connect_cpu_parent(tmc);
+			trace_tmigr_connect_cpu_parent(hier, tmc);
 
 			/* There are no children that need to be connected */
 			continue;
 		} else {
 			child = stack[i - 1];
-			tmigr_connect_child_parent(child, group, activate);
+			tmigr_connect_child_parent(hier, child, group, activate);
 		}
 	}
 
@@ -1895,18 +1934,23 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 			data.childmask = start->groupmask;
 			__walk_groups_from(tmigr_active_up, &data, start, start->parent);
 		}
+	} else if (start) {
+		union tmigr_state state;
+
+		/* Remote activation assumes the whole target's hierarchy is inactive */
+		state.state = atomic_read(&start->migr_state);
+		WARN_ON_ONCE(state.active);
 	}
 
 	/* Root update */
-	if (list_is_singular(&tmigr_level_list[top])) {
-		group = list_first_entry(&tmigr_level_list[top],
-					 typeof(*group), list);
+	if (list_is_singular(&hier->level_list[top])) {
+		group = list_first_entry(&hier->level_list[top], typeof(*group), list);
 		WARN_ON_ONCE(group->parent);
-		if (tmigr_root) {
+		if (root) {
 			/* Old root should be the same or below */
-			WARN_ON_ONCE(tmigr_root->level > top);
+			WARN_ON_ONCE(root->level > top);
 		}
-		tmigr_root = group;
+		hier->root = group;
 	}
 out:
 	kfree(stack);
@@ -1914,34 +1958,123 @@ out:
 	return err;
 }
 
+static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu)
+{
+	struct tmigr_hierarchy *hier;
+
+	hier = __tmigr_get_hierarchy(cpu);
+
+	if (hier)
+		return hier;
+
+	hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels);
+	if (!hier)
+		return ERR_PTR(-ENOMEM);
+
+	hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
+	if (!hier->cpumask) {
+		kfree(hier);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (int i = 0; i < tmigr_hierarchy_levels; i++)
+		INIT_LIST_HEAD(&hier->level_list[i]);
+
+	hier->capacity = tmigr_get_capacity(cpu);
+	list_add_tail(&hier->node, &tmigr_hierarchy_list);
+
+	return hier;
+}
+
+static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu,
+				  struct tmigr_group *old_root,	bool activate)
+{
+	/*
+	 * The target CPU must never do the prepare work, except
+	 * on early boot when the boot CPU is the target. Otherwise
+	 * it may spuriously activate the old top level group inside
+	 * the new one (nevertheless whether old top level group is
+	 * active or not) and/or release an uninitialized childmask.
+	 */
+	WARN_ON_ONCE(cpu == smp_processor_id());
+	if (activate) {
+		/*
+		 * The current CPU is expected to be online in the hierarchy,
+		 * otherwise the old root may not be active as expected.
+		 */
+		WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
+	}
+
+	return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate);
+}
+
+static long connect_old_root_work(void *arg)
+{
+	struct tmigr_group *old_root = arg;
+	struct tmigr_hierarchy *hier;
+	int cpu = smp_processor_id();
+
+	hier = __tmigr_get_hierarchy(cpu);
+	if (WARN_ON_ONCE(!hier))
+		return -EINVAL;
+
+	return tmigr_connect_old_root(hier, cpu, old_root, true);
+}
+
 static int tmigr_add_cpu(unsigned int cpu)
 {
-	struct tmigr_group *old_root = tmigr_root;
+	struct tmigr_hierarchy *hier;
+	struct tmigr_group *old_root;
 	int node = cpu_to_node(cpu);
 	int ret;
 
 	guard(mutex)(&tmigr_mutex);
 
-	ret = tmigr_setup_groups(cpu, node, NULL, false);
+	hier = tmigr_get_hierarchy(cpu);
+	if (IS_ERR(hier))
+		return PTR_ERR(hier);
+
+	old_root = hier->root;
+
+	ret = tmigr_setup_groups(hier, cpu, node, NULL, false);
+
+	if (ret < 0)
+		return ret;
 
 	/* Root has changed? Connect the old one to the new */
-	if (ret >= 0 && old_root && old_root != tmigr_root) {
-		/*
-		 * The target CPU must never do the prepare work, except
-		 * on early boot when the boot CPU is the target. Otherwise
-		 * it may spuriously activate the old top level group inside
-		 * the new one (nevertheless whether old top level group is
-		 * active or not) and/or release an uninitialized childmask.
-		 */
-		WARN_ON_ONCE(cpu == raw_smp_processor_id());
-		/*
-		 * The (likely) current CPU is expected to be online in the hierarchy,
-		 * otherwise the old root may not be active as expected.
-		 */
-		WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
-		ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+	if (old_root && old_root != hier->root) {
+		guard(migrate)();
+
+		if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
+			/*
+			 * If the target belong to the same hierarchy, the old root is expected
+			 * to be active. Link and propagate to the new root.
+			 */
+			ret = tmigr_connect_old_root(hier, cpu, old_root, true);
+		} else {
+			int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);
+
+			if (target < nr_cpu_ids) {
+				/*
+				 * If the target doesn't belong to the same hierarchy as the current
+				 * CPU, activate from a relevant one to make sure the old root is
+				 * active.
+				 */
+				ret = work_on_cpu(target, connect_old_root_work, old_root);
+			} else {
+				/*
+				 * No other available CPUs in the remote hierarchy. Link the
+				 * old root remotely but don't propagate activation since the
+				 * old root is not expected to be active.
+				 */
+				ret = tmigr_connect_old_root(hier, cpu, old_root, false);
+			}
+		}
 	}
 
+	if (ret >= 0)
+		cpumask_set_cpu(cpu, hier->cpumask);
+
 	return ret;
 }
 
@@ -1974,7 +2107,7 @@ static int tmigr_cpu_prepare(unsigned int cpu)
 
 static int __init tmigr_init(void)
 {
-	unsigned int cpulvl, nodelvl, cpus_per_node, i;
+	unsigned int cpulvl, nodelvl, cpus_per_node;
 	unsigned int nnodes = num_possible_nodes();
 	unsigned int ncpus = num_possible_cpus();
 	int ret = -ENOMEM;
@@ -2021,14 +2154,6 @@ static int __init tmigr_init(void)
 	 */
 	tmigr_crossnode_level = cpulvl;
 
-	tmigr_level_list = kzalloc_objs(struct list_head,
-					tmigr_hierarchy_levels);
-	if (!tmigr_level_list)
-		goto err;
-
-	for (i = 0; i < tmigr_hierarchy_levels; i++)
-		INIT_LIST_HEAD(&tmigr_level_list[i]);
-
 	pr_info("Timer migration: %d hierarchy levels; %d children per group;"
 		" %d crossnode level\n",
 		tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 70879cde6fdd..31735dd52327 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -6,6 +6,24 @@
 #define TMIGR_CHILDREN_PER_GROUP 8
 
 /**
+ * struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity.
+ *                          Homogeneous systems have only one hierarchy.
+ *                          Heterogenous have one hierarchy per CPU capacity.
+ * @cpumask:	CPUs belonging to this hierarchy
+ * @root:	The current root of the hierarchy
+ * @capacity:	CPU capacity associated to this hierarchy
+ * @node:	Node in the global hierarchy list
+ * @level_list:	Per level lists of tmigr groups
+ */
+struct tmigr_hierarchy {
+	struct cpumask		*cpumask;
+	struct tmigr_group	*root;
+	unsigned long		capacity;
+	struct list_head	node;
+	struct list_head	level_list[];
+};
+
+/**
  * struct tmigr_event - a timer event associated to a CPU
  * @nextevt:	The node to enqueue an event in the parent group queue
  * @cpu:	The CPU to which this event belongs
@@ -75,15 +93,17 @@ struct tmigr_group {
 /**
  * struct tmigr_cpu - timer migration per CPU group
  * @lock:		Lock protecting the tmigr_cpu group information
- * @online:		Indicates whether the CPU is online; In deactivate path
- *			it is required to know whether the migrator in the top
- *			level group is to be set offline, while a timer is
- *			pending. Then another online CPU needs to be notified to
- *			take over the migrator role. Furthermore the information
- *			is required in CPU hotplug path as the CPU is able to go
- *			idle before the timer migration hierarchy hotplug AP is
- *			reached. During this phase, the CPU has to handle the
+ * @available:		Indicates whether the CPU is available for handling
+ *			global timers. In the deactivate path it is required to
+ *			know whether the migrator in the top level group is to
+ *			be set offline, while a timer is pending. Then another
+ *			available CPU needs to be notified to take over the
+ *			migrator role. Furthermore the information is required
+ *			in the CPU hotplug path as the CPU is able to go idle
+ *			before the timer migration hierarchy hotplug callback is
+ *			reached.  During this phase, the CPU has to handle the
  *			global timers on its own and must not act as a migrator.
+
  * @idle:		Indicates whether the CPU is idle in the timer migration
  *			hierarchy
  * @remote:		Is set when timers of the CPU are expired remotely