summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@kernel.org>2026-03-17 10:01:54 +0100
committerThomas Gleixner <tglx@kernel.org>2026-03-20 13:36:32 +0100
commit763aacf86f1baefb134c70813aa8c72d1675d738 (patch)
tree4d128fc3b703cf89bb37689eb25d7d04ee902ca1 /include
parent1432f9d4e8aa2d7585b678bdd0b740597af00d6e (diff)
clocksource: Rewrite watchdog code completely
The clocksource watchdog code has over time reached the state of an impenetrable maze of duct tape and staples. The original design, which was made in the context of systems far smaller than today, is based on the assumption that the to be monitored clocksource (TSC) can be trivially compared against a known to be stable clocksource (HPET/ACPI-PM timer). Over the years it turned out that this approach has major flaws: - Long delays between watchdog invocations can result in wrap arounds of the reference clocksource - Scalability of the reference clocksource readout can degrade on large multi-socket systems due to interconnect congestion This was addressed with various heuristics which degraded the accuracy of the watchdog to the point that it fails to detect actual TSC problems on older hardware which exposes slow inter CPU drifts due to firmware manipulating the TSC to hide SMI time. To address this and bring back sanity to the watchdog, rewrite the code completely with a different approach: 1) Restrict the validation against a reference clocksource to the boot CPU, which is usually the CPU/Socket closest to the legacy block which contains the reference source (HPET/ACPI-PM timer). Validate that the reference readout is within a bound latency so that the actual comparison against the TSC stays within 500ppm as long as the clocks are stable. 2) Compare the TSCs of the other CPUs in a round robin fashion against the boot CPU in the same way the TSC synchronization on CPU hotplug works. This still can suffer from delayed reaction of the remote CPU to the SMP function call and the latency of the control variable cache line. But this latency is not affecting correctness. It only affects the accuracy. With low contention the readout latency is in the low nanoseconds range, which detects even slight skews between CPUs. Under high contention this becomes obviously less accurate, but still detects slow skews reliably as it solely relies on subsequent readouts being monotonically increasing. It just can take slightly longer to detect the issue. 3) Rewrite the watchdog test so it tests the various mechanisms one by one and validating the result against the expectation. Signed-off-by: Thomas Gleixner <tglx@kernel.org> Tested-by: Borislav Petkov (AMD) <bp@alien8.de> Tested-by: Daniel J Blueman <daniel@quora.org> Reviewed-by: Jiri Wiesner <jwiesner@suse.de> Reviewed-by: Daniel J Blueman <daniel@quora.org> Link: https://patch.msgid.link/20260123231521.926490888@kernel.org Link: https://patch.msgid.link/87h5qeomm5.ffs@tglx
Diffstat (limited to 'include')
-rw-r--r--include/linux/clocksource.h28
1 files changed, 7 insertions, 21 deletions
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 25774fc5b53d..ccf5c0ca26b7 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -44,8 +44,6 @@ struct module;
* @shift: Cycle to nanosecond divisor (power of two)
* @max_idle_ns: Maximum idle time permitted by the clocksource (nsecs)
* @maxadj: Maximum adjustment value to mult (~11%)
- * @uncertainty_margin: Maximum uncertainty in nanoseconds per half second.
- * Zero says to use default WATCHDOG_THRESHOLD.
* @archdata: Optional arch-specific data
* @max_cycles: Maximum safe cycle value which won't overflow on
* multiplication
@@ -105,7 +103,6 @@ struct clocksource {
u32 shift;
u64 max_idle_ns;
u32 maxadj;
- u32 uncertainty_margin;
#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
struct arch_clocksource_data archdata;
#endif
@@ -133,6 +130,7 @@ struct clocksource {
struct list_head wd_list;
u64 cs_last;
u64 wd_last;
+ unsigned int wd_cpu;
#endif
struct module *owner;
};
@@ -142,15 +140,18 @@ struct clocksource {
*/
#define CLOCK_SOURCE_IS_CONTINUOUS 0x01
#define CLOCK_SOURCE_MUST_VERIFY 0x02
+#define CLOCK_SOURCE_CALIBRATED 0x04
#define CLOCK_SOURCE_WATCHDOG 0x10
#define CLOCK_SOURCE_VALID_FOR_HRES 0x20
#define CLOCK_SOURCE_UNSTABLE 0x40
#define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80
#define CLOCK_SOURCE_RESELECT 0x100
-#define CLOCK_SOURCE_VERIFY_PERCPU 0x200
-#define CLOCK_SOURCE_CAN_INLINE_READ 0x400
-#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x800
+#define CLOCK_SOURCE_CAN_INLINE_READ 0x200
+#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x400
+
+#define CLOCK_SOURCE_WDTEST 0x800
+#define CLOCK_SOURCE_WDTEST_PERCPU 0x1000
/* simplify initialization of mask field */
#define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0)
@@ -301,21 +302,6 @@ static inline void timer_probe(void) {}
#define TIMER_ACPI_DECLARE(name, table_id, fn) \
ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn)
-static inline unsigned int clocksource_get_max_watchdog_retry(void)
-{
- /*
- * When system is in the boot phase or under heavy workload, there
- * can be random big latencies during the clocksource/watchdog
- * read, so allow retries to filter the noise latency. As the
- * latency's frequency and maximum value goes up with the number of
- * CPUs, scale the number of retries with the number of online
- * CPUs.
- */
- return (ilog2(num_online_cpus()) / 2) + 1;
-}
-
-void clocksource_verify_percpu(struct clocksource *cs);
-
/**
* struct clocksource_base - hardware abstraction for clock on which a clocksource
* is based