Merge tag 'timers-core-2026-06-13' of gitolite.kernel.org:pub/scm/linux/kernel/git/tip/tip

Pull timer core updates from Thomas Gleixner: "Updates for the time/timer core subsystem: - Harden the user space controllable hrtimer interfaces further to protect against unpriviledged DoS attempts by arming timers in the past. - Add per-capacity hierarchies to the timer migration code to prevent timer migration accross different capacity domains. This code has been disabled last minute as there is a pathological problem with SoCs which advertise a larger number of capacity domains. The problem is under investigation and the code won't be active before v7.3, but that turned out to be less intrusive than a full revert as it preserves the preparatory steps and allows people to work on the final resolution - Export time namespace functionality as a recent user can be built as a module. - Initialize the jiffies clocksource before using it. The recent hardening against time moving backward requires that the related members of struct clocksource have been initialized, otherwise it clamps the readout to 0, which makes time stand sill and causes boot delays. - Fix a more than twenty year old PID reference count leak in an error path of the POSIX CPU timer code. - The usual small fixes, improvements and cleanups all over the place" * tag 'timers-core-2026-06-13' of gitolite.kernel.org:pub/scm/linux/kernel/git/tip/tip: (31 commits) posix-cpu-timers: Fix pid refcount leak in do_cpu_nanosleep() error path time/jiffies: Register jiffies clocksource before usage timers/migration: Temporarily disable per capacity hierarchies timers/migration: Turn tmigr_hierarchy level_list into a flexible array timers/migration: Deactivate per-capacity hierarchies under nohz_full timers/migration: Fix hotplug migrator selection target on asymetric capacity machines ntsync: Honour caller's time namespace for absolute MONOTONIC timeouts time/namespace: Export init_time_ns and do_timens_ktime_to_host() timers/migration: Update stale @online doc to @available timers: Fix flseep() typo in kernel-doc comment hrtimer: Fix the bogus return type of __hrtimer_start_range_ns() hrtimer: Return ktime_t from hrtimer_get_next_event()/hrtimer_next_event_without() clocksource: Clean up clocksource_update_freq() functions alarmtimer: Remove stale return description from alarm_handle_timer() selftests/posix_timers: Use CLOCK_THREAD_CPUTIME_ID for ITIMER_PROF measurements scripts/timers: Add timer_migration_tree.py timers/migration: Handle capacity in connect tracepoints timers/migration: Split per-capacity hierarchies timers/migration: Track CPUs in a hierarchy timers/migration: Abstract out hierarchy to prepare for CPU capacity awareness ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 13:39:12 +0530
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 13:39:12 +0530
commit: a60ce761d99ff2d9eefe33374c5f20726465a140 (patch)
tree: a7883dcce89453fe59d7cf614620e0b20bea3895 /kernel/time/timer_migration.c
parent: f20e2fdaaeb74330a6c5d65af22a8c47409a7a91 (diff)
parent: 87bd2ad568e15b90d5f7d4bcd70342d05dad649c (diff)
1 files changed, 183 insertions, 58 deletions
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 52c15affdbff..806c23cf71fc 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -102,7 +102,7 @@
  * active CPU/group information atomic_try_cmpxchg() is used instead and only
  * the per CPU tmigr_cpu->lock is held.
  *
- * During the setup of groups tmigr_level_list is required. It is protected by
+ * During the setup of groups, hier->level_list is required. It is protected by
  * @tmigr_mutex.
  *
  * When @timer_base->lock as well as tmigr related locks are required, the lock
@@ -416,13 +416,12 @@
  */
 
 static DEFINE_MUTEX(tmigr_mutex);
-static struct list_head *tmigr_level_list __read_mostly;
+
+static LIST_HEAD(tmigr_hierarchy_list);
 
 static unsigned int tmigr_hierarchy_levels __read_mostly;
 static unsigned int tmigr_crossnode_level __read_mostly;
 
-static struct tmigr_group *tmigr_root;
-
 static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
 
 /*
@@ -1469,6 +1468,34 @@ static long tmigr_trigger_active(void *unused)
 	return 0;
 }
 
+static unsigned int tmigr_get_capacity(int cpu)
+{
+	/*
+	 * nohz_full CPUs need to make sure there is always an available (online)
+	 * and never idle migrator to handle all their global timers. That duty
+	 * is served by the timekeeper which then never stops its tick. But the
+	 * timekeeper must then belong to the same hierarchy as all the nohz_full
+	 * CPUs. Simply turn off capacity awareness when nohz_full is running.
+	 */
+	if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN))
+		return SCHED_CAPACITY_SCALE;
+	else
+		return arch_scale_cpu_capacity(cpu);
+}
+
+static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu)
+{
+	unsigned int capacity = tmigr_get_capacity(cpu);
+	struct tmigr_hierarchy *iter;
+
+	list_for_each_entry(iter, &tmigr_hierarchy_list, node) {
+		if (iter->capacity == capacity)
+			return iter;
+	}
+
+	return NULL;
+}
+
 static int tmigr_clear_cpu_available(unsigned int cpu)
 {
 	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1493,8 +1520,21 @@ static int tmigr_clear_cpu_available(unsigned int cpu)
 	}
 
 	if (firstexp != KTIME_MAX) {
-		migrator = cpumask_any(tmigr_available_cpumask);
-		work_on_cpu(migrator, tmigr_trigger_active, NULL);
+		struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu);
+
+		if (WARN_ON_ONCE(!hier))
+			return -EINVAL;
+
+		migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask);
+		if (migrator < nr_cpu_ids) {
+			work_on_cpu(migrator, tmigr_trigger_active, NULL);
+		} else {
+			/*
+			 * If deactivation returned an expiration, it belongs to an available
+			 * nohz CPU in the hierarchy.
+			 */
+			WARN_ONCE(1, "Expected available CPU in the hierarchy\n");
+		}
 	}
 
 	return 0;
@@ -1657,14 +1697,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	group->groupevt.ignore = true;
 }
 
-static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl)
 {
 	struct tmigr_group *tmp, *group = NULL;
 
 	lockdep_assert_held(&tmigr_mutex);
 
 	/* Try to attach to an existing group first */
-	list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
+	list_for_each_entry(tmp, &hier->level_list[lvl], list) {
 		/*
 		 * If @lvl is below the cross NUMA node level, check whether
 		 * this group belongs to the same NUMA node.
@@ -1698,14 +1738,14 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
 	tmigr_init_group(group, lvl, node);
 
 	/* Setup successful. Add it to the hierarchy */
-	list_add(&group->list, &tmigr_level_list[lvl]);
+	list_add(&group->list, &hier->level_list[lvl]);
 	trace_tmigr_group_set(group);
 	return group;
 }
 
-static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate)
 {
-	if (!group->parent && group != tmigr_root) {
+	if (!group->parent && group != hier->root) {
 		/*
 		 * This is the new top-level, prepare its groupmask in advance
 		 * to avoid accidents where yet another new top-level is
@@ -1721,11 +1761,10 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate)
 
 }
 
-static void tmigr_connect_child_parent(struct tmigr_group *child,
-				       struct tmigr_group *parent,
-				       bool activate)
+static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child,
+				       struct tmigr_group *parent, bool activate)
 {
-	if (tmigr_init_root(parent, activate)) {
+	if (tmigr_init_root(hier, parent, activate)) {
 		/*
 		 * The previous top level had prepared its groupmask already,
 		 * simply account it in advance as the first child. If some groups
@@ -1758,13 +1797,13 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	 */
 	smp_store_release(&child->parent, parent);
 
-	trace_tmigr_connect_child_parent(child);
+	trace_tmigr_connect_child_parent(hier, child);
 }
 
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
-			      struct tmigr_group *start, bool activate)
+static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu,
+			      unsigned int node, struct tmigr_group *start, bool activate)
 {
-	struct tmigr_group *group, *child, **stack;
+	struct tmigr_group *root = hier->root, *group, *child, **stack;
 	int i, top = 0, err = 0, start_lvl = 0;
 	bool root_mismatch = false;
 
@@ -1777,11 +1816,11 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 		start_lvl = start->level + 1;
 	}
 
-	if (tmigr_root)
-		root_mismatch = tmigr_root->numa_node != node;
+	if (root)
+		root_mismatch = root->numa_node != node;
 
 	for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
-		group = tmigr_get_group(node, i);
+		group = tmigr_get_group(hier, node, i);
 		if (IS_ERR(group)) {
 			err = PTR_ERR(group);
 			i--;
@@ -1803,7 +1842,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 		if (group->parent)
 			break;
 		if ((!root_mismatch || i >= tmigr_crossnode_level) &&
-		    list_is_singular(&tmigr_level_list[i]))
+		    list_is_singular(&hier->level_list[i]))
 			break;
 	}
 
@@ -1831,15 +1870,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 			tmc->tmgroup = group;
 			tmc->groupmask = BIT(group->num_children++);
 
-			tmigr_init_root(group, activate);
+			tmigr_init_root(hier, group, activate);
 
-			trace_tmigr_connect_cpu_parent(tmc);
+			trace_tmigr_connect_cpu_parent(hier, tmc);
 
 			/* There are no children that need to be connected */
 			continue;
 		} else {
 			child = stack[i - 1];
-			tmigr_connect_child_parent(child, group, activate);
+			tmigr_connect_child_parent(hier, child, group, activate);
 		}
 	}
 
@@ -1895,18 +1934,23 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 			data.childmask = start->groupmask;
 			__walk_groups_from(tmigr_active_up, &data, start, start->parent);
 		}
+	} else if (start) {
+		union tmigr_state state;
+
+		/* Remote activation assumes the whole target's hierarchy is inactive */
+		state.state = atomic_read(&start->migr_state);
+		WARN_ON_ONCE(state.active);
 	}
 
 	/* Root update */
-	if (list_is_singular(&tmigr_level_list[top])) {
-		group = list_first_entry(&tmigr_level_list[top],
-					 typeof(*group), list);
+	if (list_is_singular(&hier->level_list[top])) {
+		group = list_first_entry(&hier->level_list[top], typeof(*group), list);
 		WARN_ON_ONCE(group->parent);
-		if (tmigr_root) {
+		if (root) {
 			/* Old root should be the same or below */
-			WARN_ON_ONCE(tmigr_root->level > top);
+			WARN_ON_ONCE(root->level > top);
 		}
-		tmigr_root = group;
+		hier->root = group;
 	}
 out:
 	kfree(stack);
@@ -1914,34 +1958,123 @@ out:
 	return err;
 }
 
+static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu)
+{
+	struct tmigr_hierarchy *hier;
+
+	hier = __tmigr_get_hierarchy(cpu);
+
+	if (hier)
+		return hier;
+
+	hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels);
+	if (!hier)
+		return ERR_PTR(-ENOMEM);
+
+	hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL);
+	if (!hier->cpumask) {
+		kfree(hier);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (int i = 0; i < tmigr_hierarchy_levels; i++)
+		INIT_LIST_HEAD(&hier->level_list[i]);
+
+	hier->capacity = tmigr_get_capacity(cpu);
+	list_add_tail(&hier->node, &tmigr_hierarchy_list);
+
+	return hier;
+}
+
+static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu,
+				  struct tmigr_group *old_root,	bool activate)
+{
+	/*
+	 * The target CPU must never do the prepare work, except
+	 * on early boot when the boot CPU is the target. Otherwise
+	 * it may spuriously activate the old top level group inside
+	 * the new one (nevertheless whether old top level group is
+	 * active or not) and/or release an uninitialized childmask.
+	 */
+	WARN_ON_ONCE(cpu == smp_processor_id());
+	if (activate) {
+		/*
+		 * The current CPU is expected to be online in the hierarchy,
+		 * otherwise the old root may not be active as expected.
+		 */
+		WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available));
+	}
+
+	return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate);
+}
+
+static long connect_old_root_work(void *arg)
+{
+	struct tmigr_group *old_root = arg;
+	struct tmigr_hierarchy *hier;
+	int cpu = smp_processor_id();
+
+	hier = __tmigr_get_hierarchy(cpu);
+	if (WARN_ON_ONCE(!hier))
+		return -EINVAL;
+
+	return tmigr_connect_old_root(hier, cpu, old_root, true);
+}
+
 static int tmigr_add_cpu(unsigned int cpu)
 {
-	struct tmigr_group *old_root = tmigr_root;
+	struct tmigr_hierarchy *hier;
+	struct tmigr_group *old_root;
 	int node = cpu_to_node(cpu);
 	int ret;
 
 	guard(mutex)(&tmigr_mutex);
 
-	ret = tmigr_setup_groups(cpu, node, NULL, false);
+	hier = tmigr_get_hierarchy(cpu);
+	if (IS_ERR(hier))
+		return PTR_ERR(hier);
+
+	old_root = hier->root;
+
+	ret = tmigr_setup_groups(hier, cpu, node, NULL, false);
+
+	if (ret < 0)
+		return ret;
 
 	/* Root has changed? Connect the old one to the new */
-	if (ret >= 0 && old_root && old_root != tmigr_root) {
-		/*
-		 * The target CPU must never do the prepare work, except
-		 * on early boot when the boot CPU is the target. Otherwise
-		 * it may spuriously activate the old top level group inside
-		 * the new one (nevertheless whether old top level group is
-		 * active or not) and/or release an uninitialized childmask.
-		 */
-		WARN_ON_ONCE(cpu == raw_smp_processor_id());
-		/*
-		 * The (likely) current CPU is expected to be online in the hierarchy,
-		 * otherwise the old root may not be active as expected.
-		 */
-		WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
-		ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+	if (old_root && old_root != hier->root) {
+		guard(migrate)();
+
+		if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) {
+			/*
+			 * If the target belong to the same hierarchy, the old root is expected
+			 * to be active. Link and propagate to the new root.
+			 */
+			ret = tmigr_connect_old_root(hier, cpu, old_root, true);
+		} else {
+			int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask);
+
+			if (target < nr_cpu_ids) {
+				/*
+				 * If the target doesn't belong to the same hierarchy as the current
+				 * CPU, activate from a relevant one to make sure the old root is
+				 * active.
+				 */
+				ret = work_on_cpu(target, connect_old_root_work, old_root);
+			} else {
+				/*
+				 * No other available CPUs in the remote hierarchy. Link the
+				 * old root remotely but don't propagate activation since the
+				 * old root is not expected to be active.
+				 */
+				ret = tmigr_connect_old_root(hier, cpu, old_root, false);
+			}
+		}
 	}
 
+	if (ret >= 0)
+		cpumask_set_cpu(cpu, hier->cpumask);
+
 	return ret;
 }
 
@@ -1974,7 +2107,7 @@ static int tmigr_cpu_prepare(unsigned int cpu)
 
 static int __init tmigr_init(void)
 {
-	unsigned int cpulvl, nodelvl, cpus_per_node, i;
+	unsigned int cpulvl, nodelvl, cpus_per_node;
 	unsigned int nnodes = num_possible_nodes();
 	unsigned int ncpus = num_possible_cpus();
 	int ret = -ENOMEM;
@@ -2021,14 +2154,6 @@ static int __init tmigr_init(void)
 	 */
 	tmigr_crossnode_level = cpulvl;
 
-	tmigr_level_list = kzalloc_objs(struct list_head,
-					tmigr_hierarchy_levels);
-	if (!tmigr_level_list)
-		goto err;
-
-	for (i = 0; i < tmigr_hierarchy_levels; i++)
-		INIT_LIST_HEAD(&tmigr_level_list[i]);
-
 	pr_info("Timer migration: %d hierarchy levels; %d children per group;"
 		" %d crossnode level\n",
 		tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-15 13:39:12 +0530
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-15 13:39:12 +0530
commit	a60ce761d99ff2d9eefe33374c5f20726465a140 (patch)
tree	a7883dcce89453fe59d7cf614620e0b20bea3895 /kernel/time/timer_migration.c
parent	f20e2fdaaeb74330a6c5d65af22a8c47409a7a91 (diff)
parent	87bd2ad568e15b90d5f7d4bcd70342d05dad649c (diff)