From bd5956166d20adbde3af0f6f265dc2f0ce5f4df9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2026 13:53:46 +0200 Subject: hrtimer: Provide hrtimer_start_range_ns_user() Calvin reported an odd NMI watchdog lockup which claims that the CPU locked up in user space. He provided a reproducer, which sets up a timerfd based timer and then rearms it in a loop with an absolute expiry time of 1ns. As the expiry time is in the past, the timer ends up as the first expiring timer in the per CPU hrtimer base and the clockevent device is programmed with the minimum delta value. If the machine is fast enough, this ends up in an endless loop of programming the delta value to the minimum value defined by the clock event device, before the timer interrupt can fire, which starves the interrupt and consequently triggers the lockup detector because the hrtimer callback of the lockup mechanism is never invoked. The clockevents code already has a last resort mechanism to prevent that, but it's sensible to catch such issues before trying to reprogram the clock event device. Provide a variant of hrtimer_start_range_ns(), which sanity checks the timer after queueing it. It does not do so before queueing because the timer might be armed and therefore needs to be dequeued. Also, the check is done at the latest possible point, so that the clock event prevention mechanism is avoided as much as possible. If the timer is already expired _before_ the clock event is reprogrammed, remove the timer from the queue and signal to the caller that the operation failed by returning false. That allows the caller to take immediate action without going through the loops and hoops of the hrtimer interrupt. The queueing code can't invoke the timer callback as the caller might hold a lock which is taken in the callback. Add a tracepoint which allows analyzing the expired-at-start situation.
Reported-by: Calvin Owens Signed-off-by: Thomas Gleixner Tested-by: Calvin Owens Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260408114951.995031895@kernel.org --- include/trace/events/timer.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 07cbb9836b91..ca82fd62dc30 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -298,6 +298,19 @@ DECLARE_EVENT_CLASS(hrtimer_class, TP_printk("hrtimer=%p", __entry->hrtimer) ); +/** + * hrtimer_start_expired - Invoked when an expired timer was started + * @hrtimer: pointer to struct hrtimer + * + * Preceded by a hrtimer_start tracepoint. + */ +DEFINE_EVENT(hrtimer_class, hrtimer_start_expired, + + TP_PROTO(struct hrtimer *hrtimer), + + TP_ARGS(hrtimer) +); + /** * hrtimer_expire_exit - called immediately after the hrtimer callback returns * @hrtimer: pointer to struct hrtimer -- cgit v1.2.3 From 5a7dfbcbbdb683e6f704966e73c02f4ba8eb6014 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 Apr 2026 18:53:53 +0200 Subject: timers/migration: Handle capacity in connect tracepoints This lets tracers know which hierarchy a CPU belongs to. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260423165354.95152-6-frederic@kernel.org --- include/trace/events/timer_migration.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h index 61171b13c687..0b135e9301b1 100644 --- a/include/trace/events/timer_migration.h +++ b/include/trace/events/timer_migration.h @@ -33,15 +33,16 @@ TRACE_EVENT(tmigr_group_set, TRACE_EVENT(tmigr_connect_child_parent, - TP_PROTO(struct tmigr_group *child), + TP_PROTO(struct tmigr_hierarchy *hier, struct tmigr_group *child), - TP_ARGS(child), + TP_ARGS(hier, child), TP_STRUCT__entry( __field( void *, child ) __field( void *, parent ) __field( unsigned int, lvl ) __field( unsigned int, numa_node ) + __field( unsigned int, capacity ) __field( unsigned int, num_children ) __field( u32, groupmask ) ), @@ -51,26 +52,28 @@ TRACE_EVENT(tmigr_connect_child_parent, __entry->parent = child->parent; __entry->lvl = child->parent->level; __entry->numa_node = child->parent->numa_node; + __entry->capacity = hier->capacity; __entry->num_children = child->parent->num_children; __entry->groupmask = child->groupmask; ), - TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d", - __entry->child, __entry->groupmask, __entry->parent, - __entry->lvl, __entry->numa_node, __entry->num_children) + TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d capacity=%d num_children=%d", + __entry->child, __entry->groupmask, __entry->parent, __entry->lvl, + __entry->numa_node, __entry->capacity, __entry->num_children) ); TRACE_EVENT(tmigr_connect_cpu_parent, - TP_PROTO(struct tmigr_cpu *tmc), + TP_PROTO(struct tmigr_hierarchy *hier, struct tmigr_cpu *tmc), - TP_ARGS(tmc), + TP_ARGS(hier, tmc), TP_STRUCT__entry( __field( void *, parent ) __field( unsigned int, cpu ) __field( unsigned int, lvl ) 
__field( unsigned int, numa_node ) + __field( unsigned int, capacity ) __field( unsigned int, num_children ) __field( u32, groupmask ) ), @@ -80,13 +83,14 @@ TRACE_EVENT(tmigr_connect_cpu_parent, __entry->cpu = tmc->cpuevt.cpu; __entry->lvl = tmc->tmgroup->level; __entry->numa_node = tmc->tmgroup->numa_node; + __entry->capacity = hier->capacity; __entry->num_children = tmc->tmgroup->num_children; __entry->groupmask = tmc->groupmask; ), - TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d", - __entry->cpu, __entry->groupmask, __entry->parent, - __entry->lvl, __entry->numa_node, __entry->num_children) + TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d capacity=%d num_children=%d", + __entry->cpu, __entry->groupmask, __entry->parent, __entry->lvl, + __entry->numa_node, __entry->capacity, __entry->num_children) ); DECLARE_EVENT_CLASS(tmigr_group_and_cpu, -- cgit v1.2.3