Merge tag 'timers-core-2026-06-13' of gitolite.kernel.org:pub/scm/linux/kernel/git/tip/tip

Pull timer core updates from Thomas Gleixner: "Updates for the time/timer core subsystem: - Harden the user space controllable hrtimer interfaces further to protect against unpriviledged DoS attempts by arming timers in the past. - Add per-capacity hierarchies to the timer migration code to prevent timer migration accross different capacity domains. This code has been disabled last minute as there is a pathological problem with SoCs which advertise a larger number of capacity domains. The problem is under investigation and the code won't be active before v7.3, but that turned out to be less intrusive than a full revert as it preserves the preparatory steps and allows people to work on the final resolution - Export time namespace functionality as a recent user can be built as a module. - Initialize the jiffies clocksource before using it. The recent hardening against time moving backward requires that the related members of struct clocksource have been initialized, otherwise it clamps the readout to 0, which makes time stand sill and causes boot delays. - Fix a more than twenty year old PID reference count leak in an error path of the POSIX CPU timer code. - The usual small fixes, improvements and cleanups all over the place" * tag 'timers-core-2026-06-13' of gitolite.kernel.org:pub/scm/linux/kernel/git/tip/tip: (31 commits) posix-cpu-timers: Fix pid refcount leak in do_cpu_nanosleep() error path time/jiffies: Register jiffies clocksource before usage timers/migration: Temporarily disable per capacity hierarchies timers/migration: Turn tmigr_hierarchy level_list into a flexible array timers/migration: Deactivate per-capacity hierarchies under nohz_full timers/migration: Fix hotplug migrator selection target on asymetric capacity machines ntsync: Honour caller's time namespace for absolute MONOTONIC timeouts time/namespace: Export init_time_ns and do_timens_ktime_to_host() timers/migration: Update stale @online doc to @available timers: Fix flseep() typo in kernel-doc comment hrtimer: Fix the bogus return type of __hrtimer_start_range_ns() hrtimer: Return ktime_t from hrtimer_get_next_event()/hrtimer_next_event_without() clocksource: Clean up clocksource_update_freq() functions alarmtimer: Remove stale return description from alarm_handle_timer() selftests/posix_timers: Use CLOCK_THREAD_CPUTIME_ID for ITIMER_PROF measurements scripts/timers: Add timer_migration_tree.py timers/migration: Handle capacity in connect tracepoints timers/migration: Split per-capacity hierarchies timers/migration: Track CPUs in a hierarchy timers/migration: Abstract out hierarchy to prepare for CPU capacity awareness ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 13:39:12 +0530
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 13:39:12 +0530
commit: a60ce761d99ff2d9eefe33374c5f20726465a140 (patch)
tree: a7883dcce89453fe59d7cf614620e0b20bea3895 /kernel/time
parent: f20e2fdaaeb74330a6c5d65af22a8c47409a7a91 (diff)
parent: 87bd2ad568e15b90d5f7d4bcd70342d05dad649c (diff)
12 files changed, 419 insertions, 167 deletions
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 6e173d70d825..ea5be5870e51 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -337,48 +337,32 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
 EXPORT_SYMBOL_GPL(alarm_init);
 
 /**
- * alarm_start - Sets an absolute alarm to fire
- * @alarm: ptr to alarm to set
- * @start: time to run the alarm
+ * alarm_start_timer - Sets an alarm to fire
+ * @alarm:	Pointer to alarm to set
+ * @expires:	Expiry time
+ * @relative:	True if @expires is relative
+ *
+ * Returns: True if the alarm was queued. False if it already expired
  */
-void alarm_start(struct alarm *alarm, ktime_t start)
+bool alarm_start_timer(struct alarm *alarm, ktime_t expires, bool relative)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 
-	scoped_guard(spinlock_irqsave, &base->lock) {
-		alarm->node.expires = start;
-		alarmtimer_enqueue(base, alarm);
-		hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
-	}
+	if (relative)
+		expires = ktime_add_safe(expires, base->get_ktime());
 
 	trace_alarmtimer_start(alarm, base->get_ktime());
-}
-EXPORT_SYMBOL_GPL(alarm_start);
-
-/**
- * alarm_start_relative - Sets a relative alarm to fire
- * @alarm: ptr to alarm to set
- * @start: time relative to now to run the alarm
- */
-void alarm_start_relative(struct alarm *alarm, ktime_t start)
-{
-	struct alarm_base *base = &alarm_bases[alarm->type];
-
-	start = ktime_add_safe(start, base->get_ktime());
-	alarm_start(alarm, start);
-}
-EXPORT_SYMBOL_GPL(alarm_start_relative);
-
-void alarm_restart(struct alarm *alarm)
-{
-	struct alarm_base *base = &alarm_bases[alarm->type];
 
 	guard(spinlock_irqsave)(&base->lock);
-	hrtimer_set_expires(&alarm->timer, alarm->node.expires);
-	hrtimer_restart(&alarm->timer);
+	alarm->node.expires = expires;
 	alarmtimer_enqueue(base, alarm);
+	if (!hrtimer_start_range_ns_user(&alarm->timer, expires, 0, HRTIMER_MODE_ABS)) {
+		alarmtimer_dequeue(base, alarm);
+		return false;
+	}
+	return true;
 }
-EXPORT_SYMBOL_GPL(alarm_restart);
+EXPORT_SYMBOL_GPL(alarm_start_timer);
 
 /**
  * alarm_try_to_cancel - Tries to cancel an alarm timer
@@ -512,8 +496,6 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
  * @now: time at the timer expiration
  *
  * Posix timer callback for expired alarm timers.
- *
- * Return: whether the timer is to be restarted
  */
 static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
 {
@@ -527,12 +509,12 @@ static void alarm_handle_timer(struct alarm *alarm, ktime_t now)
  * alarm_timer_rearm - Posix timer callback for rearming timer
  * @timr:	Pointer to the posixtimer data struct
  */
-static void alarm_timer_rearm(struct k_itimer *timr)
+static bool alarm_timer_rearm(struct k_itimer *timr)
 {
 	struct alarm *alarm = &timr->it.alarm.alarmtimer;
 
 	timr->it_overrun += alarm_forward_now(alarm, timr->it_interval);
-	alarm_start(alarm, alarm->node.expires);
+	return alarm_start_timer(alarm, alarm->node.expires, false);
 }
 
 /**
@@ -588,7 +570,7 @@ static void alarm_timer_wait_running(struct k_itimer *timr)
  * @absolute:	Expiry value is absolute time
  * @sigev_none:	Posix timer does not deliver signals
  */
-static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
+static bool alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
 			    bool absolute, bool sigev_none)
 {
 	struct alarm *alarm = &timr->it.alarm.alarmtimer;
@@ -596,10 +578,16 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
 
 	if (!absolute)
 		expires = ktime_add_safe(expires, base->get_ktime());
-	if (sigev_none)
+
+	/*
+	 * sigev_none needs to update the expires value and pretend
+	 * that the timer is queued
+	 */
+	if (sigev_none) {
 		alarm->node.expires = expires;
-	else
-		alarm_start(&timr->it.alarm.alarmtimer, expires);
+		return true;
+	}
+	return alarm_start_timer(&timr->it.alarm.alarmtimer, expires, false);
 }
 
 /**
@@ -706,7 +694,9 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 	alarm->data = (void *)current;
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
-		alarm_start(alarm, absexp);
+		if (!alarm_start_timer(alarm, absexp, false))
+			alarm->data = NULL;
+
 		if (likely(alarm->data))
 			schedule();
 
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 313f6c88148e..e48c4d379a7c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1222,14 +1222,8 @@ static void clocksource_enqueue(struct clocksource *cs)
  * @cs:		clocksource to be registered
  * @scale:	Scale factor multiplied against freq to get clocksource hz
  * @freq:	clocksource frequency (cycles per second) divided by scale
- *
- * This should only be called from the clocksource->enable() method.
- *
- * This *SHOULD NOT* be called directly! Please use the
- * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
- * functions.
  */
-void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
+static void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
 	u64 sec;
 
@@ -1287,7 +1281,6 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
 	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
 		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
 }
-EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
 
 /**
  * __clocksource_register_scale - Used to install new clocksources
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5bd6efe598f0..638ce623c342 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1352,8 +1352,14 @@ static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool
 	return hrtimer_prefer_local(is_local, is_first, is_pinned);
 }
 
-static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
-				     const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
+enum {
+	HRTIMER_REPROGRAM_NONE,
+	HRTIMER_REPROGRAM,
+	HRTIMER_REPROGRAM_FORCE,
+};
+
+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+				    const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
 {
 	struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
 	bool is_pinned, first, was_first, keep_base = false;
@@ -1410,7 +1416,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 	/* If a deferred rearm is pending skip reprogramming the device */
 	if (cpu_base->deferred_rearm) {
 		cpu_base->deferred_needs_update = true;
-		return false;
+		return HRTIMER_REPROGRAM_NONE;
 	}
 
 	if (!was_first || cpu_base != this_cpu_base) {
@@ -1423,7 +1429,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 		 * callbacks.
 		 */
 		if (likely(hrtimer_base_is_online(this_cpu_base)))
-			return first;
+			return first ? HRTIMER_REPROGRAM : HRTIMER_REPROGRAM_NONE;
 
 		/*
 		 * Timer was enqueued remote because the current base is
@@ -1432,7 +1438,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 		 */
 		if (first)
 			smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
-		return false;
+		return HRTIMER_REPROGRAM_NONE;
 	}
 
 	/*
@@ -1446,7 +1452,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 	 */
 	if (timer->is_lazy) {
 		if (cpu_base->expires_next <= hrtimer_get_expires(timer))
-			return false;
+			return HRTIMER_REPROGRAM_NONE;
 	}
 
 	/*
@@ -1455,8 +1461,24 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del
 	 * reprogram the hardware by evaluating the new first expiring
 	 * timer.
 	 */
-	hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
-	return false;
+	return HRTIMER_REPROGRAM_FORCE;
+}
+
+static int hrtimer_start_range_ns_common(struct hrtimer *timer, ktime_t tim,
+					 u64 delta_ns, const enum hrtimer_mode mode,
+					 struct hrtimer_clock_base *base)
+{
+	/*
+	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
+	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
+	 * expiry mode because unmarked timers are moved to softirq expiry.
+	 */
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+	else
+		WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
+
+	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base);
 }
 
 /**
@@ -1476,24 +1498,104 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
 
 	debug_hrtimer_assert_init(timer);
 
+	base = lock_hrtimer_base(timer, &flags);
+
+	switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
+	case HRTIMER_REPROGRAM:
+		hrtimer_reprogram(timer, true);
+		break;
+	case HRTIMER_REPROGRAM_FORCE:
+		hrtimer_force_reprogram(timer->base->cpu_base, 1);
+		break;
+	case HRTIMER_REPROGRAM_NONE:
+		break;
+	}
+
+	unlock_hrtimer_base(timer, &flags);
+}
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+static inline bool hrtimer_check_user_timer(struct hrtimer *timer)
+{
+	struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base;
+	ktime_t expires;
+
 	/*
-	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
-	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
-	 * expiry mode because unmarked timers are moved to softirq expiry.
+	 * This uses soft expires because that's the user provided
+	 * expiry time, while expires can be further in the past
+	 * due to a slack value added to the user expiry time.
 	 */
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
-		WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
-	else
-		WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
+	expires = hrtimer_get_softexpires(timer);
+
+	/* Convert to monotonic */
+	expires = ktime_sub(expires, timer->base->offset);
+
+	/*
+	 * Check whether this timer will end up as the first expiring timer in
+	 * the CPU base. If not, no further checks required as it's then
+	 * guaranteed to expire in the future.
+	 */
+	if (expires >= cpu_base->expires_next)
+		return true;
+
+	/* Validate that the expiry time is in the future. */
+	if (expires > ktime_get())
+		return true;
+
+	debug_hrtimer_deactivate(timer);
+	__remove_hrtimer(timer, timer->base, HRTIMER_STATE_INACTIVE, false);
+	trace_hrtimer_start_expired(timer);
+	return false;
+}
+
+/**
+ * hrtimer_start_range_ns_user - (re)start an user controlled hrtimer
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
+ * @mode:	timer mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
+ *		softirq based mode is considered for debug purpose only!
+ *
+ * Returns: True when the timer was queued, false if it was already expired
+ *
+ * This function cannot invoke the timer callback for expired timers as it might
+ * be called under a lock which the timer callback needs to acquire. So the
+ * caller has to handle that case.
+ */
+bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim,
+				 u64 delta_ns, const enum hrtimer_mode mode)
+{
+	struct hrtimer_clock_base *base;
+	unsigned long flags;
+	bool ret = true;
+
+	debug_hrtimer_assert_init(timer);
 
 	base = lock_hrtimer_base(timer, &flags);
 
-	if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
-		hrtimer_reprogram(timer, true);
+	switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) {
+	case HRTIMER_REPROGRAM:
+		ret = hrtimer_check_user_timer(timer);
+		if (ret)
+			hrtimer_reprogram(timer, true);
+		break;
+	case HRTIMER_REPROGRAM_FORCE:
+		ret = hrtimer_check_user_timer(timer);
+		/*
+		 * The base must always be reevaluated, independent of the
+		 * result above because the timer was the first pending timer.
+		 */
+		hrtimer_force_reprogram(timer->base->cpu_base, 1);
+		break;
+	case HRTIMER_REPROGRAM_NONE:
+		break;
+	}
 
 	unlock_hrtimer_base(timer, &flags);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns_user);
 
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
@@ -1681,10 +1783,10 @@ EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
  *
  * Returns the next expiry time or KTIME_MAX if no timer is pending.
  */
-u64 hrtimer_get_next_event(void)
+ktime_t hrtimer_get_next_event(void)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	u64 expires = KTIME_MAX;
+	ktime_t expires = KTIME_MAX;
 
 	guard(raw_spinlock_irqsave)(&cpu_base->lock);
 	if (!hrtimer_hres_active(cpu_base))
@@ -1700,10 +1802,10 @@ u64 hrtimer_get_next_event(void)
  * Returns the next expiry time over all timers except for the @exclude one or
  * KTIME_MAX if none of them is pending.
  */
-u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+ktime_t hrtimer_next_event_without(const struct hrtimer *exclude)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	u64 expires = KTIME_MAX;
+	ktime_t expires = KTIME_MAX;
 	unsigned int active;
 
 	guard(raw_spinlock_irqsave)(&cpu_base->lock);
@@ -2213,7 +2315,11 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode
 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
 		mode |= HRTIMER_MODE_HARD;
 
-	hrtimer_start_expires(&sl->timer, mode);
+	/* If already expired, clear the task pointer and set current state to running */
+	if (!hrtimer_start_expires_user(&sl->timer, mode)) {
+		sl->task = NULL;
+		__set_current_state(TASK_RUNNING);
+	}
 }
 EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
 
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1c954f330dfe..d51428867a33 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -60,15 +60,14 @@ EXPORT_SYMBOL(get_jiffies_64);
 
 EXPORT_SYMBOL(jiffies);
 
-static int __init init_jiffies_clocksource(void)
-{
-	return __clocksource_register(&clocksource_jiffies);
-}
-
-core_initcall(init_jiffies_clocksource);
+static bool cs_jiffies_registered __initdata;
 
 struct clocksource * __init __weak clocksource_default_clock(void)
 {
+	if (!cs_jiffies_registered) {
+		__clocksource_register(&clocksource_jiffies);
+		cs_jiffies_registered = true;
+	}
 	return &clocksource_jiffies;
 }
 
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 4bca3f78c8ea..5fa0af66cf3f 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -57,6 +57,7 @@ ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
 
 	return tim;
 }
+EXPORT_SYMBOL_GPL(do_timens_ktime_to_host);
 
 static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
 {
@@ -351,6 +352,7 @@ struct time_namespace init_time_ns = {
 	.user_ns	= &init_user_ns,
 	.frozen_offsets	= true,
 };
+EXPORT_SYMBOL_GPL(init_time_ns);
 
 void __init time_ns_init(void)
 {
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0de2bb7cbec0..74775b94d11b 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -19,7 +19,7 @@
 
 #include "posix-timers.h"
 
-static void posix_cpu_timer_rearm(struct k_itimer *timer);
+static bool posix_cpu_timer_rearm(struct k_itimer *timer);
 
 void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
 {
@@ -1011,24 +1011,27 @@ static void check_process_timers(struct task_struct *tsk,
 /*
  * This is called from the signal code (via posixtimer_rearm)
  * when the last timer signal was delivered and we have to reload the timer.
+ *
+ * Return true unconditionally so the core code assumes the timer to be
+ * armed. Otherwise it would requeue the signal.
  */
-static void posix_cpu_timer_rearm(struct k_itimer *timer)
+static bool posix_cpu_timer_rearm(struct k_itimer *timer)
 {
 	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
-	struct task_struct *p;
 	struct sighand_struct *sighand;
+	struct task_struct *p;
 	unsigned long flags;
 	u64 now;
 
-	rcu_read_lock();
+	guard(rcu)();
 	p = cpu_timer_task_rcu(timer);
 	if (!p)
-		goto out;
+		return true;
 
 	/* Protect timer list r/w in arm_timer() */
 	sighand = lock_task_sighand(p, &flags);
 	if (unlikely(sighand == NULL))
-		goto out;
+		return true;
 
 	/*
 	 * Fetch the current sample and update the timer's expiry time.
@@ -1045,8 +1048,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
 	 */
 	arm_timer(timer, p);
 	unlock_task_sighand(p, &flags);
-out:
-	rcu_read_unlock();
+	return true;
 }
 
 /**
@@ -1504,6 +1506,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		spin_lock_irq(&timer.it_lock);
 		error = posix_cpu_timer_set(&timer, flags, &it, NULL);
 		if (error) {
+			posix_cpu_timer_del(&timer);
 			spin_unlock_irq(&timer.it_lock);
 			return error;
 		}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 9331e1614124..436ba794cc0b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -288,16 +288,18 @@ static inline int timer_overrun_to_int(struct k_itimer *timr)
 	return (int)timr->it_overrun_last;
 }
 
-static void common_hrtimer_rearm(struct k_itimer *timr)
+static bool common_hrtimer_rearm(struct k_itimer *timr)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
 
 	timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval);
-	hrtimer_restart(timer);
+	return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS);
 }
 
 static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr)
 {
+	bool queued;
+
 	guard(spinlock)(&timr->it_lock);
 
 	/*
@@ -311,12 +313,18 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it
 	if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
 		return true;
 
-	timr->kclock->timer_rearm(timr);
-	timr->it_status = POSIX_TIMER_ARMED;
+	/* timer_rearm() updates timr::it_overrun */
+	queued = timr->kclock->timer_rearm(timr);
+
 	timr->it_overrun_last = timr->it_overrun;
 	timr->it_overrun = -1LL;
 	++timr->it_signal_seq;
 	info->si_overrun = timer_overrun_to_int(timr);
+
+	if (queued)
+		timr->it_status = POSIX_TIMER_ARMED;
+	else
+		posix_timer_queue_signal(timr);
 	return true;
 }
 
@@ -795,7 +803,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
 		return timer_overrun_to_int(scoped_timer);
 }
 
-static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
+static bool common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
 			       bool absolute, bool sigev_none)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
@@ -820,8 +828,11 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
 		expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer));
 	hrtimer_set_expires(timer, expires);
 
-	if (!sigev_none)
-		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+	/* For sigev_none pretend that the timer is queued */
+	if (sigev_none)
+		return true;
+
+	return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS);
 }
 
 static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
@@ -903,9 +914,13 @@ int common_timer_set(struct k_itimer *timr, int flags,
 		expires = timens_ktime_to_host(timr->it_clock, expires);
 	sigev_none = timr->it_sigev_notify == SIGEV_NONE;
 
-	kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
-	if (!sigev_none)
-		timr->it_status = POSIX_TIMER_ARMED;
+	if (kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none)) {
+		if (!sigev_none)
+			timr->it_status = POSIX_TIMER_ARMED;
+	} else {
+		/* Timer was already expired, queue the signal */
+		posix_timer_queue_signal(timr);
+	}
 	return 0;
 }
 
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 7f259e845d24..4ea9611dd716 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -27,11 +27,11 @@ struct k_clock {
 	int	(*timer_del)(struct k_itimer *timr);
 	void	(*timer_get)(struct k_itimer *timr,
 			     struct itimerspec64 *cur_setting);
-	void	(*timer_rearm)(struct k_itimer *timr);
+	bool	(*timer_rearm)(struct k_itimer *timr);
 	s64	(*timer_forward)(struct k_itimer *timr, ktime_t now);
 	ktime_t	(*timer_remaining)(struct k_itimer *timr, ktime_t now);
 	int	(*timer_try_to_cancel)(struct k_itimer *timr);
-	void	(*timer_arm)(struct k_itimer *timr, ktime_t expires,
+	bool	(*timer_arm)(struct k_itimer *timr, ktime_t expires,
 			     bool absolute, bool sigev_none);
 	void	(*timer_wait_running)(struct k_itimer *timr);
 };
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cbbb87a0c6e7..3026a301dff7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1407,8 +1407,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 	 * If the next highres timer to expire is earlier than 'next_event', the
 	 * idle governor needs to know that.
 	 */
-	next_event = min_t(u64, next_event,
-			   hrtimer_next_event_without(&ts->sched_timer));
+	next_event = min(next_event, hrtimer_next_event_without(&ts->sched_timer));
 
 	return ktime_sub(next_event, now);
 }
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 04d928c21aba..655a8c6cd84d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1932,7 +1932,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
  */
 static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 {
-	u64 nextevt = hrtimer_get_next_event();
+	u64 nextevt = ktime_to_ns(hrtimer_get_next_event());
 
 	/*
 	 * If high resolution timers are enabled
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 52c15affdbff..806c23cf71fc 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -102,7 +102,7 @@
  * active CPU/group information atomic_try_cmpxchg() is used instead and only
  * the per CPU tmigr_cpu->lock is held.
  *
- * During the setup of groups tmigr_level_list is required. It is protected by
+ * During the setup of groups, hier->level_list is required. It is protected by
  * @tmigr_mutex.
  *
  * When @timer_base->lock as well as tmigr related locks are required, the lock
@@ -416,13 +416,12 @@
  */
 
 static DEFINE_MUTEX(tmigr_mutex);
-static struct list_head *tmigr_level_list __read_mostly;
+
+static LIST_HEAD(tmigr_hierarchy_list);
 
 static unsigned int tmigr_hierarchy_levels __read_mostly;
 static unsigned int tmigr_crossnode_level __read_mostly;
 
-static struct tmigr_group *tmigr_root;
-
 static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
 
 /*
@@ -1469,6 +1468,34 @@ static long tmigr_trigger_active(void *unused)
 	return 0;
 }
 
+static unsigned int tmigr_get_capacity(int cpu)
+{
+	/*
+	 * nohz_full CPUs need to make sure there is always an available (online)
+	 * and never idle migrator to handle all their global timers. That duty
+	 * is served by the timekeeper which then never stops its tick. But the
+	 * timekeeper must then belong to the same hierarchy as all the nohz_full
+	 * CPUs. Simply turn off capacity awareness when nohz_full is running.
+	 */
+	if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN))
+		return SCHED_CAPACITY_SCALE;
+	else
+		return arch_scale_cpu_capacity(cpu);
+}
+
+static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu)
+{
+	unsigned int capacity = tmigr_get_capacity(cpu);
+	struct tmigr_hierarchy *iter;
+
+	list_for_each_entry(iter, &tmigr_hierarchy_list, node) {
+		if (iter->capacity == capacity)
+			return iter;
+	}
+
+	return NULL;
+}
+
 static int tmigr_clear_cpu_available(unsigned int cpu)
 {
 	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1493,8 +1520,21 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
 	}
 
 	if (firstexp != KTIME_MAX) {
-		migrator = cpumask_any(tmigr_available_cpumask);
-		work_on_cpu(migrator, tmigr_trigger_active, NULL);
+		struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu);
+
+		if (WARN_ON_ONCE(!hier))
+			return -EINVAL;
+
+		migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask);
+		if (migrator < nr_cpu_ids) {
+			work_on_cpu(migrator, tmigr_trigger_active, NULL);
+		} else {
+			/*
+			 * If deactivation returned an expiration, it belongs to an available
+			 * nohz CPU in the hierarchy.
+			 */
+			WARN_ONCE(1, "Expected available CPU in the hierarchy\n");
+		}
 	}
 
 	return 0;
@@ -1657,14 +1697,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	group->groupevt.ignore = true;
 }
 
-static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl)
 {
 	struct tmigr_group *tmp, *group = NULL;
 
 	lockdep_assert_held(&tmigr_mutex);
 
 	/* Try to attach to an existing group first */
-	list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
+	list_for_each_entry(tmp, &hier->level_list[lvl], list) {
 		/*
 		 * If @lvl is below the cross NUMA node level, check whether
 		 * this group belongs to the same NUMA node.
@@ -1698,14 +1738,14 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
 	tmigr_init_group(group, lvl, node);
 
 	/* Setup successful. Add it to the hierarchy */
-	list_add(&group->list, &tmigr_level_list[lvl]);
+	list_add(&group->list, &hier->level_list[lvl]);
 	trace_tmigr_group_set(group);
 	return group;
 }
 
-static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate)
 {
-	if (!group->parent && group != tmigr_root) {
+	if (!group->parent && group != hier->root) {
 		/*
 		 * This is the new top-level, prepare its groupmask in advance
 		 * to avoid accidents where yet another new top-level is
@@ -1721,11 +1761,10 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate)
 
 }
 
-static void tmigr_connect_child_parent(struct tmigr_group *child,
-				       struct tmigr_group *parent,
-				       bool activate)
+static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child,
+				       struct tmigr_group *parent, bool activate)
 {
-	if (tmigr_init_root(parent, activate)) {
+	if (tmigr_init_root(hier, parent, activate)) {
 		/*
 		 * The previous top level had prepared its groupmask already,
 		 * simply account it in advance as the first child. If some groups
@@ -1758,13 +1797,13 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	 */
 	smp_store_release(&child->parent, parent);
 
-	trace_tmigr_connect_child_parent(child);
+	trace_tmigr_connect_child_parent(hier, child);
 }
 
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
-			      struct tmigr_group *start, bool activate)
+static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
+			      unsigned int node, struct tmigr_group *start, bool activate)
 {
-	struct tmigr_group *group, *child, **stack;
+	struct tmigr_group *root = hier->root, *group, *child, **stack;
 	int i, top = 0, err = 0, start_lvl = 0;
 	bool root_mismatch = false;
 
@@ -1777,11 +1816,11 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 		start_lvl = start->level + 1;
 	}
 
-	if (tmigr_root)
-		root_mismatch = tmigr_root->numa_node != node;
+	if (root)
+		root_mismatch = root->numa_node != node;
 
 	for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
-		group = tmigr_get_group(node, i);
+		group = tmigr_get_group(hier, node, i);
 		if (IS_ERR(group)) {
 			err = PTR_ERR(group);
 			i--;
@@ -1803,7 +1842,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 		if (group->parent)
 			break;
 		if ((!root_mismatch || i >= tmigr_crossnode_level) &&
-		    list_is_singular(&tmigr_level_list[i]))
+		    list_is_singular(&hier->level_list[i]))
 			break;
 	}
 
@@ -1831,15 +1870,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 			tmc->tmgroup = group;
 			tmc->groupmask = BIT(group->num_children++);
 
-			tmigr_init_root(group, activate);
+			tmigr_init_root(hier, group, activate);
 
-			trace_tmigr_connect_cpu_parent(tmc);
+			trace_tmigr_connect_cpu_parent(hier, tmc);
 
 			/* There are no children that need to be connected */
 			continue;
 		} else {
 			child = stack[i - 1];
-			tmigr_connect_child_parent(child, group, activate);
+			tmigr_connect_child_parent(hier, child, group, activate);
 		}
 	}
 
@@ -1895,18 +1934,23 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 			data.childmask = start->groupmask;
 			__walk_groups_from(tmigr_active_up, &data, start, start->parent);
 		}
+	} else if (start) {
+		union tmigr_state state;
+
+		/* Remote activation assumes the whole target's hierarchy is inactive */
+		state.state = atomic_read(&start->migr_state);
+		WARN_ON_ONCE(state.active);
 	}
 
 	/* Root update */
-	if (list_is_singular(&tmigr_level_list[top])) {
-		group = list_first_entry(&tmigr_level_list[top],
-					 typeof(*group), list);
+	if (list_is_singular(&hier->level_list[top])) {
+		group = list_first_entry(&hier->level_list[top], typeof(*group), list);
 		WARN_ON_ONCE(group->parent);
-		if (tmigr_root) {
+		if (root) {
 			/* Old root should be the same or below */
-			WARN_ON_ONCE(tmigr_root->level > top);
+			WARN_ON_ONCE(root->level > top);
 		}
-		tmigr_root = group;
+		hier->root = group;
 	}
 out:
 	kfree(stack);
@@ -1914,34 +1958,123 @@ out:
 	return err;
 }
 
+static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu)
+{
+	struct tmigr_hierarchy *hier;
+
+	hier = __tmigr_get_hierarchy(cpu);
+
+	if (hier)
+		return hier;
+
+	hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels);
+	if (!hier)
+		return ERR_PTR(-ENOMEM);
+
+	hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
+	if (!hier->cpumask) {
+		kfree(hier);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (int i = 0; i < tmigr_hierarchy_levels; i++)
+		INIT_LIST_HEAD(&hier->level_list[i]);
+
+	hier->capacity = tmigr_get_capacity(cpu);
+	list_add_tail(&hier->node, &tmigr_hierarchy_list);
+
+	return hier;
+}
+
+static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu,
+				  struct tmigr_group *old_root,	bool activate)
+{
+	/*
+	 * The target CPU must never do the prepare work, except
+	 * on early boot when the boot CPU is the target. Otherwise
+	 * it may spuriously activate the old top level group inside
+	 * the new one (nevertheless whether old top level group is
+	 * active or not) and/or release an uninitialized childmask.
+	 */
+	WARN_ON_ONCE(cpu == smp_processor_id());
+	if (activate) {
+		/*
+		 * The current CPU is expected to be online in the hierarchy,
+		 * otherwise the old root may not be active as expected.
+		 */
+		WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
+	}
+
+	return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate);
+}
+
+static long connect_old_root_work(void *arg)
+{
+	struct tmigr_group *old_root = arg;
+	struct tmigr_hierarchy *hier;
+	int cpu = smp_processor_id();
+
+	hier = __tmigr_get_hierarchy(cpu);
+	if (WARN_ON_ONCE(!hier))
+		return -EINVAL;
+
+	return tmigr_connect_old_root(hier, cpu, old_root, true);
+}
+
 static int tmigr_add_cpu(unsigned int cpu)
 {
-	struct tmigr_group *old_root = tmigr_root;
+	struct tmigr_hierarchy *hier;
+	struct tmigr_group *old_root;
 	int node = cpu_to_node(cpu);
 	int ret;
 
 	guard(mutex)(&tmigr_mutex);
 
-	ret = tmigr_setup_groups(cpu, node, NULL, false);
+	hier = tmigr_get_hierarchy(cpu);
+	if (IS_ERR(hier))
+		return PTR_ERR(hier);
+
+	old_root = hier->root;
+
+	ret = tmigr_setup_groups(hier, cpu, node, NULL, false);
+
+	if (ret < 0)
+		return ret;
 
 	/* Root has changed? Connect the old one to the new */
-	if (ret >= 0 && old_root && old_root != tmigr_root) {
-		/*
-		 * The target CPU must never do the prepare work, except
-		 * on early boot when the boot CPU is the target. Otherwise
-		 * it may spuriously activate the old top level group inside
-		 * the new one (nevertheless whether old top level group is
-		 * active or not) and/or release an uninitialized childmask.
-		 */
-		WARN_ON_ONCE(cpu == raw_smp_processor_id());
-		/*
-		 * The (likely) current CPU is expected to be online in the hierarchy,
-		 * otherwise the old root may not be active as expected.
-		 */
-		WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
-		ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+	if (old_root && old_root != hier->root) {
+		guard(migrate)();
+
+		if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
+			/*
+			 * If the target belong to the same hierarchy, the old root is expected
+			 * to be active. Link and propagate to the new root.
+			 */
+			ret = tmigr_connect_old_root(hier, cpu, old_root, true);
+		} else {
+			int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);
+
+			if (target < nr_cpu_ids) {
+				/*
+				 * If the target doesn't belong to the same hierarchy as the current
+				 * CPU, activate from a relevant one to make sure the old root is
+				 * active.
+				 */
+				ret = work_on_cpu(target, connect_old_root_work, old_root);
+			} else {
+				/*
+				 * No other available CPUs in the remote hierarchy. Link the
+				 * old root remotely but don't propagate activation since the
+				 * old root is not expected to be active.
+				 */
+				ret = tmigr_connect_old_root(hier, cpu, old_root, false);
+			}
+		}
 	}
 
+	if (ret >= 0)
+		cpumask_set_cpu(cpu, hier->cpumask);
+
 	return ret;
 }
 
@@ -1974,7 +2107,7 @@ static int tmigr_cpu_prepare(unsigned int cpu)
 
 static int __init tmigr_init(void)
 {
-	unsigned int cpulvl, nodelvl, cpus_per_node, i;
+	unsigned int cpulvl, nodelvl, cpus_per_node;
 	unsigned int nnodes = num_possible_nodes();
 	unsigned int ncpus = num_possible_cpus();
 	int ret = -ENOMEM;
@@ -2021,14 +2154,6 @@ static int __init tmigr_init(void)
 	 */
 	tmigr_crossnode_level = cpulvl;
 
-	tmigr_level_list = kzalloc_objs(struct list_head,
-					tmigr_hierarchy_levels);
-	if (!tmigr_level_list)
-		goto err;
-
-	for (i = 0; i < tmigr_hierarchy_levels; i++)
-		INIT_LIST_HEAD(&tmigr_level_list[i]);
-
 	pr_info("Timer migration: %d hierarchy levels; %d children per group;"
 		" %d crossnode level\n",
 		tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 70879cde6fdd..31735dd52327 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -6,6 +6,24 @@
 #define TMIGR_CHILDREN_PER_GROUP 8
 
 /**
+ * struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity.
+ *                          Homogeneous systems have only one hierarchy.
+ *                          Heterogenous have one hierarchy per CPU capacity.
+ * @cpumask:	CPUs belonging to this hierarchy
+ * @root:	The current root of the hierarchy
+ * @capacity:	CPU capacity associated to this hierarchy
+ * @node:	Node in the global hierarchy list
+ * @level_list:	Per level lists of tmigr groups
+ */
+struct tmigr_hierarchy {
+	struct cpumask		*cpumask;
+	struct tmigr_group	*root;
+	unsigned long		capacity;
+	struct list_head	node;
+	struct list_head	level_list[];
+};
+
+/**
  * struct tmigr_event - a timer event associated to a CPU
  * @nextevt:	The node to enqueue an event in the parent group queue
  * @cpu:	The CPU to which this event belongs
@@ -75,15 +93,17 @@ struct tmigr_group {
 /**
  * struct tmigr_cpu - timer migration per CPU group
  * @lock:		Lock protecting the tmigr_cpu group information
- * @online:		Indicates whether the CPU is online; In deactivate path
- *			it is required to know whether the migrator in the top
- *			level group is to be set offline, while a timer is
- *			pending. Then another online CPU needs to be notified to
- *			take over the migrator role. Furthermore the information
- *			is required in CPU hotplug path as the CPU is able to go
- *			idle before the timer migration hierarchy hotplug AP is
- *			reached. During this phase, the CPU has to handle the
+ * @available:		Indicates whether the CPU is available for handling
+ *			global timers. In the deactivate path it is required to
+ *			know whether the migrator in the top level group is to
+ *			be set offline, while a timer is pending. Then another
+ *			available CPU needs to be notified to take over the
+ *			migrator role. Furthermore the information is required
+ *			in the CPU hotplug path as the CPU is able to go idle
+ *			before the timer migration hierarchy hotplug callback is
+ *			reached.  During this phase, the CPU has to handle the
  *			global timers on its own and must not act as a migrator.
+
  * @idle:		Indicates whether the CPU is idle in the timer migration
  *			hierarchy
  * @remote:		Is set when timers of the CPU are expired remotely
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-15 13:39:12 +0530
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-15 13:39:12 +0530
commit	a60ce761d99ff2d9eefe33374c5f20726465a140 (patch)
tree	a7883dcce89453fe59d7cf614620e0b20bea3895 /kernel/time
parent	f20e2fdaaeb74330a6c5d65af22a8c47409a7a91 (diff)
parent	87bd2ad568e15b90d5f7d4bcd70342d05dad649c (diff)