From 25ff69011ddf9ec73114382dc90040a4cad490b0 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 15 Dec 2025 13:12:29 +0200 Subject: intel_idle: Remove unused driver version constant The INTEL_IDLE_VERSION constant has not been updated since 2020 and serves no useful purpose. The driver version is implicitly defined by the kernel version, making this constant redundant. Remove the constant to eliminate potential confusion about version tracking. Signed-off-by: Artem Bityutskiy Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20251215111229.132705-1-dedekind1@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 9ba83954c255..aa44b3c2cb2c 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -63,8 +63,6 @@ #include #include -#define INTEL_IDLE_VERSION "0.5.1" - static struct cpuidle_driver intel_idle_driver = { .name = "intel_idle", .owner = THIS_MODULE, @@ -2478,9 +2476,6 @@ static int __init intel_idle_init(void) return -ENODEV; } - pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", - boot_cpu_data.x86_model); - intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); if (!intel_idle_cpuidle_devices) return -ENOMEM; -- cgit v1.2.3 From a36dc37b56722bc114d5dd5657b884334031eb49 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 15 Dec 2025 13:13:00 +0200 Subject: intel_idle: Remove the 'preferred_cstates' parameter Remove the 'preferred_cstates' module parameter as it is not really useful. The parameter currently only affects Alder Lake, where it controls C1/C1E preference, with C1E being the default. The parameter does not support any other platform. For example, Meteor Lake has a similar C1/C1E limitation, but the parameter does not support Meteor Lake. This indicates that the parameter is not very useful. 
Generally, independent C1 and C1E are important for server platforms where low latency is key. However, they are not as important for client platforms, like Alder Lake, where C1E providing better energy savings is generally preferred. The parameter was originally introduced for Sapphire Rapids Xeon: da0e58c038e6 intel_idle: add 'preferred_cstates' module argument Later it was added to Alder Lake: d1cf8bbfed1ed ("intel_idle: Add AlderLake support") But it was removed from Sapphire Rapids when firmware fixed the C1/C1E limitation: 1548fac47a114 ("intel_idle: make SPR C1 and C1E be independent") So Alder Lake is the only platform left where this parameter has any effect. Remove this parameter to simplify the driver and reduce maintenance burden. Signed-off-by: Artem Bityutskiy Link: https://patch.msgid.link/20251215111300.132803-1-dedekind1@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index aa44b3c2cb2c..2d67a091ed3f 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -70,7 +70,6 @@ static struct cpuidle_driver intel_idle_driver = { /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask __read_mostly; -static unsigned int preferred_states_mask __read_mostly; static bool force_irq_on __read_mostly; static bool ibrs_off __read_mostly; @@ -2049,25 +2048,6 @@ static void __init skx_idle_state_table_update(void) } } -/** - * adl_idle_state_table_update - Adjust AlderLake idle states table. - */ -static void __init adl_idle_state_table_update(void) -{ - /* Check if user prefers C1 over C1E. 
*/ - if (preferred_states_mask & BIT(1) && !(preferred_states_mask & BIT(2))) { - cpuidle_state_table[0].flags &= ~CPUIDLE_FLAG_UNUSABLE; - cpuidle_state_table[1].flags |= CPUIDLE_FLAG_UNUSABLE; - - /* Disable C1E by clearing the "C1E promotion" bit. */ - c1e_promotion = C1E_PROMOTION_DISABLE; - return; - } - - /* Make sure C1E is enabled by default */ - c1e_promotion = C1E_PROMOTION_ENABLE; -} - /** * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. */ @@ -2174,11 +2154,6 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_EMERALDRAPIDS_X: spr_idle_state_table_update(); break; - case INTEL_ALDERLAKE: - case INTEL_ALDERLAKE_L: - case INTEL_ATOM_GRACEMONT: - adl_idle_state_table_update(); - break; case INTEL_ATOM_SILVERMONT: case INTEL_ATOM_AIRMONT: byt_cht_auto_demotion_disable(); @@ -2532,17 +2507,6 @@ module_param(max_cstate, int, 0444); */ module_param_named(states_off, disabled_states_mask, uint, 0444); MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); -/* - * Some platforms come with mutually exclusive C-states, so that if one is - * enabled, the other C-states must not be used. Example: C1 and C1E on - * Sapphire Rapids platform. This parameter allows for selecting the - * preferred C-states among the groups of mutually exclusive C-states - the - * selected C-states will be registered, the other C-states from the mutually - * exclusive group won't be registered. If the platform has no mutually - * exclusive C-states, this parameter has no effect. - */ -module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); -MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); /* * Debugging option that forces the driver to enter all C-states with * interrupts enabled. 
Does not apply to C-states with -- cgit v1.2.3 From ff24f314447a25164bac85cb310c382e289afdbe Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 16 Dec 2025 10:04:00 +0200 Subject: intel_idle: Initialize sysfs after cpuidle driver initialization Reorder initialization calls to initialize the internal driver data before sysfs: Was: intel_idle_sysfs_init(); intel_idle_cpuidle_driver_init(); Now: intel_idle_cpuidle_driver_init(); intel_idle_sysfs_init(); Follow the general principle that drivers should initialize internal state before registering external interfaces like sysfs, avoiding potential usage before full initialization. Signed-off-by: Artem Bityutskiy Link: https://patch.msgid.link/20251216080402.156988-2-dedekind1@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 2d67a091ed3f..f64463e00df7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -2455,12 +2455,12 @@ static int __init intel_idle_init(void) if (!intel_idle_cpuidle_devices) return -ENOMEM; + intel_idle_cpuidle_driver_init(&intel_idle_driver); + retval = intel_idle_sysfs_init(); if (retval) pr_warn("failed to initialized sysfs"); - intel_idle_cpuidle_driver_init(&intel_idle_driver); - retval = cpuidle_register_driver(&intel_idle_driver); if (retval) { struct cpuidle_driver *drv = cpuidle_get_driver(); -- cgit v1.2.3 From 111f77a233484cf39a6317f4d0306387e9ffda7b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 16 Dec 2025 10:04:01 +0200 Subject: intel_idle: Add cmdline option to adjust C-states table Add a new module parameter that allows adjusting the C-states table used by the driver. Currently, the C-states table is hardcoded in the driver based on the CPU model. The goal is to have good enough defaults for most users. 
However, C-state characteristics, such as exit latency and residency, can vary between different variants of the same CPU model and BIOS settings. Moreover, different platform usage models and user preferences may benefit from different C-state target_residency values. Provide a way for users to adjust the C-states table via a module parameter "table". The general format is: "state1:latency1:target_residency1,state2:latency2:target_residency2,..." In other words, represent each C-state by its name, exit latency (in microseconds), and target residency (in microseconds), separated by colons. Separate multiple C-states by commas. For example, suppose a CPU has 3 C-states with the following characteristics: C1: exit_latency=1, target_residency=2 C1E: exit_latency=10, target_residency=10 C6: exit_latency=100, target_residency=500 Users can specify a custom C-states table as follows: 1. intel_idle.table="C1:2:2,C1E:5:20,C6:150:600" Result: C1: exit_latency=2, target_residency=2 C1E: exit_latency=5, target_residency=20 C6: exit_latency=150, target_residency=600 2. intel_idle.table="C6::400" Result: C1: exit_latency=1, target_residency=2 (unchanged) C1E: exit_latency=10, target_residency=10 (unchanged) C6: exit_latency=100, target_residency=400 (only target_residency changed) Signed-off-by: Artem Bityutskiy Link: https://patch.msgid.link/20251216080402.156988-3-dedekind1@gmail.com Signed-off-by: Rafael J. 
Wysocki --- drivers/idle/intel_idle.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f64463e00df7..ab6b86ff9905 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -73,6 +73,10 @@ static unsigned int disabled_states_mask __read_mostly; static bool force_irq_on __read_mostly; static bool ibrs_off __read_mostly; +/* The maximum allowed length for the 'table' module parameter */ +#define MAX_CMDLINE_TABLE_LEN 256 +static char cmdline_table_str[MAX_CMDLINE_TABLE_LEN] __read_mostly; + static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; @@ -104,6 +108,9 @@ static struct device *sysfs_root __initdata; static const struct idle_cpu *icpu __initdata; static struct cpuidle_state *cpuidle_state_table __initdata; +/* C-states data from the 'intel_idle.table' cmdline parameter */ +static struct cpuidle_state cmdline_states[CPUIDLE_STATE_MAX] __initdata; + static unsigned int mwait_substates __initdata; /* @@ -2393,6 +2400,149 @@ static void __init intel_idle_sysfs_uninit(void) put_device(sysfs_root); } + /** + * get_cmdline_field - Get the current field from a cmdline string. + * @args: The cmdline string to get the current field from. + * @field: Pointer to the current field upon return. + * @sep: The fields separator character. + * + * Examples: + * Input: args="C1:1:1,C1E:2:10", sep=':' + * Output: field="C1", return "1:1,C1E:2:10" + * Input: args="C1:1:1,C1E:2:10", sep=',' + * Output: field="C1:1:1", return "C1E:2:10" + * Input: args="::", sep=':' + * Output: field="", return ":" + * + * Return: The continuation of the cmdline string after the field or NULL.
+ */ +static char *get_cmdline_field(char *args, char **field, char sep) +{ + unsigned int i; + + for (i = 0; args[i] && !isspace(args[i]); i++) { + if (args[i] == sep) + break; + } + + *field = args; + + if (args[i] != sep) + return NULL; + + args[i] = '\0'; + return args + i + 1; +} + +/** + * cmdline_table_adjust - Adjust the C-states table with data from cmdline. + * @drv: cpuidle driver (assumed to point to intel_idle_driver). + * + * Adjust the C-states table with data from the 'intel_idle.table' module + * parameter (if specified). + */ +static void __init cmdline_table_adjust(struct cpuidle_driver *drv) +{ + char *args = cmdline_table_str; + struct cpuidle_state *state; + int i; + + if (args[0] == '\0') + /* The 'intel_idle.table' module parameter was not specified */ + return; + + /* Create a copy of the C-states table */ + for (i = 0; i < drv->state_count; i++) + cmdline_states[i] = drv->states[i]; + + /* + * Adjust the C-states table copy with data from the 'intel_idle.table' + * module parameter. + */ + while (args) { + char *fields, *name, *val; + + /* + * Get the next C-state definition, which is expected to be + * '<name>:<latency>:<target_residency>'. Treat "empty" + * fields as unchanged. For example, + * '<name>::<target_residency>' leaves the latency unchanged.
+ */ + args = get_cmdline_field(args, &fields, ','); + + /* name */ + fields = get_cmdline_field(fields, &name, ':'); + if (!fields) + goto error; + + if (!strcmp(name, "POLL")) { + pr_err("Cannot adjust POLL\n"); + continue; + } + + /* Find the C-state by its name */ + state = NULL; + for (i = 0; i < drv->state_count; i++) { + if (!strcmp(name, drv->states[i].name)) { + state = &cmdline_states[i]; + break; + } + } + + if (!state) { + pr_err("C-state '%s' was not found\n", name); + continue; + } + + /* Latency */ + fields = get_cmdline_field(fields, &val, ':'); + if (!fields) + goto error; + + if (*val) { + if (kstrtouint(val, 0, &state->exit_latency)) + goto error; + } + + /* Target residency */ + fields = get_cmdline_field(fields, &val, ':'); + + if (*val) { + if (kstrtouint(val, 0, &state->target_residency)) + goto error; + } + + /* + * Allow for 3 more fields, but ignore them. Helps to make + * possible future extensions of the cmdline format backward + * compatible. + */ + for (i = 0; fields && i < 3; i++) { + fields = get_cmdline_field(fields, &val, ':'); + if (!fields) + break; + } + + if (fields) { + pr_err("Too many fields for C-state '%s'\n", state->name); + goto error; + } + + pr_info("C-state from cmdline: name=%s, latency=%u, residency=%u\n", + state->name, state->exit_latency, state->target_residency); + } + + /* Copy the adjusted C-states table back */ + for (i = 1; i < drv->state_count; i++) + drv->states[i] = cmdline_states[i]; + + pr_info("Adjusted C-states with data from 'intel_idle.table'\n"); + return; + +error: + pr_info("Failed to adjust C-states with data from 'intel_idle.table'\n"); +} + static int __init intel_idle_init(void) { const struct x86_cpu_id *id; @@ -2456,6 +2606,7 @@ static int __init intel_idle_init(void) return -ENOMEM; intel_idle_cpuidle_driver_init(&intel_idle_driver); + cmdline_table_adjust(&intel_idle_driver); retval = intel_idle_sysfs_init(); if (retval) @@ -2519,3 +2670,21 @@ module_param(force_irq_on, bool, 0444); */ 
module_param(ibrs_off, bool, 0444); MODULE_PARM_DESC(ibrs_off, "Disable IBRS when idle"); + +/* + * Define the C-states table from a user input string. Expected format is + * 'name:latency:residency', where: + * - name: The C-state name. + * - latency: The C-state exit latency in us. + * - residency: The C-state target residency in us. + * + * Multiple C-states can be defined by separating them with commas: + * 'name1:latency1:residency1,name2:latency2:residency2' + * + * Example: intel_idle.table=C1:1:1,C1E:5:10,C6:100:600 + * + * To leave latency or residency unchanged, use an empty field, for example: + * 'C1:1:1,C1E::10' - leaves C1E latency unchanged. + */ +module_param_string(table, cmdline_table_str, MAX_CMDLINE_TABLE_LEN, 0444); +MODULE_PARM_DESC(table, "Build the C-states table from a user input string"); -- cgit v1.2.3 From be6a150829b375c1b53d7ea5794ccc9edd2e0c9c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 16 Dec 2025 10:04:02 +0200 Subject: intel_idle: Add C-states validation Add validation for C-states specified via the "table=" module parameter. Treat this module parameter as untrusted input and validate it thoroughly. Signed-off-by: Artem Bityutskiy Link: https://patch.msgid.link/20251216080402.156988-4-dedekind1@gmail.com Signed-off-by: Rafael J. 
Wysocki --- drivers/idle/intel_idle.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index ab6b86ff9905..f49c939d636f 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,11 @@ static bool ibrs_off __read_mostly; /* The maximum allowed length for the 'table' module parameter */ #define MAX_CMDLINE_TABLE_LEN 256 +/* Maximum allowed C-state latency */ +#define MAX_CMDLINE_LATENCY_US (5 * USEC_PER_MSEC) +/* Maximum allowed C-state target residency */ +#define MAX_CMDLINE_RESIDENCY_US (100 * USEC_PER_MSEC) + static char cmdline_table_str[MAX_CMDLINE_TABLE_LEN] __read_mostly; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; @@ -2434,6 +2440,41 @@ static char *get_cmdline_field(char *args, char **field, char sep) return args + i + 1; } +/** + * validate_cmdline_cstate - Validate a C-state from cmdline. + * @state: The C-state to validate. + * @prev_state: The previous C-state in the table or NULL. + * + * Return: 0 if the C-state is valid or -EINVAL otherwise. + */ +static int validate_cmdline_cstate(struct cpuidle_state *state, + struct cpuidle_state *prev_state) +{ + if (state->exit_latency == 0) + /* Exit latency 0 can only be used for the POLL state */ + return -EINVAL; + + if (state->exit_latency > MAX_CMDLINE_LATENCY_US) + return -EINVAL; + + if (state->target_residency > MAX_CMDLINE_RESIDENCY_US) + return -EINVAL; + + if (state->target_residency < state->exit_latency) + return -EINVAL; + + if (!prev_state) + return 0; + + if (state->exit_latency <= prev_state->exit_latency) + return -EINVAL; + + if (state->target_residency <= prev_state->target_residency) + return -EINVAL; + + return 0; +} + /** * cmdline_table_adjust - Adjust the C-states table with data from cmdline. * @drv: cpuidle driver (assumed to point to intel_idle_driver). 
@@ -2532,6 +2573,19 @@ static void __init cmdline_table_adjust(struct cpuidle_driver *drv) state->name, state->exit_latency, state->target_residency); } + /* Validate the adjusted C-states, start with index 1 to skip POLL */ + for (i = 1; i < drv->state_count; i++) { + struct cpuidle_state *prev_state; + + state = &cmdline_states[i]; + prev_state = &cmdline_states[i - 1]; + + if (validate_cmdline_cstate(state, prev_state)) { + pr_err("C-state '%s' validation failed\n", state->name); + goto error; + } + } + /* Copy the adjusted C-states table back */ for (i = 1; i < drv->state_count; i++) drv->states[i] = cmdline_states[i]; -- cgit v1.2.3 From fcbd7897b871e157ee5c595e950c8466d86c0cd5 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 5 Jan 2026 06:37:06 -0800 Subject: cpuidle: menu: Remove incorrect unlikely() annotation The unlikely() annotation on the early-return condition in menu_select() is incorrect on systems with only one idle state (e.g., ARM64 servers with a single ACPI LPI state). Branch profiling shows 100% misprediction on such systems since drv->state_count <= 1 is always true. On platforms where only state0 is available, this path is the common case, not an unlikely edge case. Remove the misleading annotation to let the branch predictor learn the actual behavior. Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260105-annotated_idle-v1-1-10ddf0771b58@debian.org Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governors/menu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 64d6f7a1c776..ef9c5a84643e 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -271,7 +271,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, data->bucket = BUCKETS - 1; } - if (unlikely(drv->state_count <= 1 || latency_req == 0) || + if (drv->state_count <= 1 || latency_req == 0 || ((data->next_timer_ns < drv->states[1].target_residency_ns || latency_req < drv->states[1].exit_latency_ns) && !dev->states_usage[0].disable)) { -- cgit v1.2.3 From fd0d2872dc53fe55f66842767e952457348b8d18 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Tue, 6 Jan 2026 13:36:53 +0000 Subject: MAINTAINERS: Add myself as cpuidle reviewer I've been reviewing cpuidle changes, for governors in particular, for the last couple of years and will continue to do so. Signed-off-by: Christian Loehle Link: https://patch.msgid.link/71f63cb7-2d9b-49a3-9b04-a47e2edef5e0@arm.com Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 765ad2daa218..ea1d4c85b865 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6554,6 +6554,7 @@ F: rust/kernel/cpu.rs CPU IDLE TIME MANAGEMENT FRAMEWORK M: "Rafael J. Wysocki" M: Daniel Lezcano +R: Christian Loehle L: linux-pm@vger.kernel.org S: Maintained B: https://bugzilla.kernel.org -- cgit v1.2.3 From 80606f4eb8d7484ab7f7d6f0fd30d71e6fbcf328 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 20 Jan 2026 16:26:14 +0100 Subject: cpuidle: governors: menu: Always check timers with tick stopped After commit 5484e31bbbff ("cpuidle: menu: Skip tick_nohz_get_sleep_length() call in some cases"), if the return value of get_typical_interval() multiplied by NSEC_PER_USEC is not greater than RESIDENCY_THRESHOLD_NS, the menu governor will skip computing the time till the closest timer. If that happens when the tick has been stopped already, the selected idle state may be too deep due to the subsequent check comparing predicted_ns with TICK_NSEC and causing its value to be replaced with the expected time till the closest timer, which is KTIME_MAX in that case. That will cause the deepest enabled idle state to be selected, but the time till the closest timer very well may be shorter than the target residency of that state, in which case a shallower state should be used. Address this by making menu_select() always compute the time till the closest timer when the tick has been stopped. Also move the predicted_ns check mentioned above into the branch in which the time till the closest timer is determined because it only needs to be done in that case. Fixes: 5484e31bbbff ("cpuidle: menu: Skip tick_nohz_get_sleep_length() call in some cases") Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/5959091.DvuYhMxLoT@rafael.j.wysocki --- drivers/cpuidle/governors/menu.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index ef9c5a84643e..c6052055ba0f 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -239,7 +239,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* Find the shortest expected idle interval. 
*/ predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; - if (predicted_ns > RESIDENCY_THRESHOLD_NS) { + if (predicted_ns > RESIDENCY_THRESHOLD_NS || tick_nohz_tick_stopped()) { unsigned int timer_us; /* Determine the time till the closest timer. */ @@ -259,6 +259,16 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, RESOLUTION * DECAY * NSEC_PER_USEC); /* Use the lowest expected idle interval to pick the idle state. */ predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns); + /* + * If the tick is already stopped, the cost of possible short + * idle duration misprediction is much higher, because the CPU + * may be stuck in a shallow idle state for a long time as a + * result of it. In that case, say we might mispredict and use + * the known time till the closest timer event for the idle + * state selection. + */ + if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC) + predicted_ns = data->next_timer_ns; } else { /* * Because the next timer event is not going to be determined @@ -284,16 +294,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, return 0; } - /* - * If the tick is already stopped, the cost of possible short idle - * duration misprediction is much higher, because the CPU may be stuck - * in a shallow idle state for a long time as a result of it. In that - * case, say we might mispredict and use the known time till the closest - * timer event for the idle state selection. - */ - if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC) - predicted_ns = data->next_timer_ns; - /* * Find the idle state with the lowest power while satisfying * our constraints. -- cgit v1.2.3 From 4bd2221f231d798b01027367857d9ba2f24f6ea0 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 14 Jan 2026 20:44:04 +0100 Subject: cpuidle: governors: teo: Avoid selecting states with zero-size bins If the last two enabled idle states have the same target residency which is at least equal to TICK_NSEC, teo may select the next-to-last one even though the size of that state's bin is 0, which is confusing. Prevent that from happening by adding a target residency check to the relevant code path. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle [ rjw: Fixed a typo in the changelog ] Link: https://patch.msgid.link/3033265.e9J7NaK4W3@rafael.j.wysocki Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 81ac5fd58a1c..9820ef36a664 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -388,6 +388,15 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, while (min_idx < idx && drv->states[min_idx].target_residency_ns < TICK_NSEC) min_idx++; + + /* + * Avoid selecting a state with a lower index, but with + * the same target residency as the current candidate + * one. + */ + if (drv->states[min_idx].target_residency_ns == + drv->states[idx].target_residency_ns) + goto constraint; } /* @@ -410,6 +419,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } } +constraint: /* * If there is a latency constraint, it may be necessary to select an * idle state shallower than the current candidate one. -- cgit v1.2.3 From 60836533b4c7b69e6cb815c87f089e39c2878acd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 14 Jan 2026 20:44:53 +0100 Subject: cpuidle: governors: teo: Avoid fake intercepts produced by tick Tick wakeups can lead to fake intercepts that may skew idle state selection towards shallow states, so it is better to avoid counting them as intercepts. 
For this purpose, add a check causing teo_update() to only count tick wakeups as intercepts if intercepts within the tick period range are at least twice as frequent as any other events. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/3404606.44csPzL39Z@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 9820ef36a664..5434584af040 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -239,6 +239,17 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->state_bins[drv->state_count-1].hits += PULSE; return; } + /* + * If intercepts within the tick period range are not frequent + * enough, count this wakeup as a hit, since it is likely that + * the tick has woken up the CPU because an expected intercept + * was not there. Otherwise, one of the intercepts may have + * been incidentally preceded by the tick wakeup. + */ + if (3 * cpu_data->tick_intercepts < 2 * total) { + cpu_data->state_bins[idx_timer].hits += PULSE; + return; + } } /* -- cgit v1.2.3 From 475ca3470b3739150720f1b285646de38103e7b7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 14 Jan 2026 20:45:30 +0100 Subject: cpuidle: governors: teo: Refine tick_intercepts vs total events check Use 2/3 as the proportion coefficient in the check comparing cpu_data->tick_intercepts with cpu_data->total because it is close enough to the current one (5/8) and it allows of more straightforward interpretation (on average, intercepts within the tick period length are twice as frequent as other events). Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/10793374.nUPlyArG6x@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 5434584af040..750ab0678a77 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -485,7 +485,7 @@ constraint: * total wakeup events, do not stop the tick. */ if (drv->states[idx].target_residency_ns < TICK_NSEC && - cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8) + 3 * cpu_data->tick_intercepts >= 2 * cpu_data->total) duration_ns = TICK_NSEC / 2; end: -- cgit v1.2.3 From f36de72673ad80c9931c0b411df0d6ef184f6c22 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 29 Jan 2026 21:49:12 +0100 Subject: cpuidle: governors: teo: Adjust the classification of wakeup events If differences between target residency values of adjacent idle states of a given CPU are relatively large, the corresponding idle state bins used by the teo governor are large as well and the rule by which hits are distinguished from intercepts is inaccurate. Namely, by that rule, a wakeup event is classified as a hit if the sleep length (the time till the closest timer other than the tick) and the measured idle duration, adjusted for the entered idle state exit latency, fall into the same idle state bin. However, if that bin is large enough, the actual difference between the sleep length and the measured idle duration may be significant. It may in fact be significantly greater than the analogous difference for an event where the sleep length and the measured idle duration fall into different bins. For this reason, amend the rule in question with a check that will only allow a wakeup event to be counted as a hit if the sleep length is less than the "raw" measured idle duration (which means that the wakeup appears to have occurred after the anticipated timer event).
Otherwise, the event will be counted as an intercept. Also update the documentation part explaining the difference between "hits" and "intercepts" to take the above change into account. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/5093379.31r3eYUQgx@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 750ab0678a77..34b769b37a86 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -48,12 +48,11 @@ * in accordance with what happened last time. * * The "hits" metric reflects the relative frequency of situations in which the - * sleep length and the idle duration measured after CPU wakeup fall into the - * same bin (that is, the CPU appears to wake up "on time" relative to the sleep - * length). In turn, the "intercepts" metric reflects the relative frequency of - * non-timer wakeup events for which the measured idle duration falls into a bin - * that corresponds to an idle state shallower than the one whose bin is fallen - * into by the sleep length (these events are also referred to as "intercepts" + * sleep length and the idle duration measured after CPU wakeup are close enough + * (that is, the CPU appears to wake up "on time" relative to the sleep length). + * In turn, the "intercepts" metric reflects the relative frequency of non-timer + * wakeup events for which the measured idle duration is significantly different + * from the sleep length (these events are also referred to as "intercepts" * below). 
* * The governor also counts "intercepts" with the measured idle duration below @@ -167,6 +166,7 @@ static void teo_decay(unsigned int *metric) */ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { + s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); int i, idx_timer = 0, idx_duration = 0; s64 target_residency_ns, measured_ns; @@ -182,8 +182,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) */ measured_ns = S64_MAX; } else { - s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; - measured_ns = dev->last_residency_ns; /* * The delay between the wakeup and the first instruction @@ -253,12 +251,17 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } /* - * If the measured idle duration falls into the same bin as the sleep - * length, this is a "hit", so update the "hits" metric for that bin. + * If the measured idle duration (adjusted for the entered state exit + * latency) falls into the same bin as the sleep length and the latter + * is less than the "raw" measured idle duration (so the wakeup appears + * to have occurred after the anticipated timer event), this is a "hit", + * so update the "hits" metric for that bin. + * * Otherwise, update the "intercepts" metric for the bin fallen into by * the measured idle duration. */ - if (idx_timer == idx_duration) { + if (idx_timer == idx_duration && + cpu_data->sleep_length_ns - measured_ns < lat_ns / 2) { cpu_data->state_bins[idx_timer].hits += PULSE; } else { cpu_data->state_bins[idx_duration].intercepts += PULSE; -- cgit v1.2.3 From a971f984b8455db0ef23910442029cdad53bc459 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 29 Jan 2026 21:51:11 +0100 Subject: cpuidle: governors: teo: Refine intercepts-based idle state lookup There are cases in which decisions made by the teo governor are arguably overly conservative. 
For instance, suppose that there are 4 idle states and the values of the intercepts metric for the first 3 of them are 400, 250, and 251, respectively. If the total sum computed in teo_update() is 1000, the governor will select idle state 1 (provided that all idle states are enabled and the scheduler tick has not been stopped) although arguably idle state 0 would be a better choice because the likelihood of getting an idle duration below the target residency of idle state 1 is greater than the likelihood of getting an idle duration between the target residency of idle state 1 and the target residency of idle state 2. To address this, refine the candidate idle state lookup based on intercepts to start at the state with the maximum intercepts metric, below the deepest enabled one, to avoid the cases in which the search may stop before reaching that state. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle [ rjw: Fixed typo "intercetps" in new comments (3 places) ] Link: https://patch.msgid.link/2417298.ElGaqSPkdT@rafael.j.wysocki Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 50 +++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 34b769b37a86..80f3ba942a06 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -74,12 +74,17 @@ * than the candidate one (it represents the cases in which the CPU was * likely woken up by a non-timer wakeup source). * + * Also find the idle state with the maximum intercepts metric (if there are + * multiple states with the maximum intercepts metric, choose the one with + * the highest index). + * * 2. If the second sum computed in step 1 is greater than a half of the sum of * both metrics for the candidate state bin and all subsequent bins (if any), * a shallower idle state is likely to be more suitable, so look for it. 
* * - Traverse the enabled idle states shallower than the candidate one in the - * descending order. + * descending order, starting at the state with the maximum intercepts + * metric found in step 1. * * - For each of them compute the sum of the "intercepts" metrics over all * of the idle states between it and the candidate one (including the @@ -308,8 +313,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, ktime_t delta_tick = TICK_NSEC / 2; unsigned int idx_intercept_sum = 0; unsigned int intercept_sum = 0; + unsigned int intercept_max = 0; unsigned int idx_hit_sum = 0; unsigned int hit_sum = 0; + int intercept_max_idx = -1; int constraint_idx = 0; int idx0 = 0, idx = -1; s64 duration_ns; @@ -340,17 +347,32 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (!dev->states_usage[0].disable) idx = 0; - /* Compute the sums of metrics for early wakeup pattern detection. */ + /* + * Compute the sums of metrics for early wakeup pattern detection and + * look for the state bin with the maximum intercepts metric below the + * deepest enabled one (if there are multiple states with the maximum + * intercepts metric, choose the one with the highest index). + */ for (i = 1; i < drv->state_count; i++) { struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; + unsigned int prev_intercepts = prev_bin->intercepts; struct cpuidle_state *s = &drv->states[i]; /* * Update the sums of idle state metrics for all of the states * shallower than the current one. */ - intercept_sum += prev_bin->intercepts; hit_sum += prev_bin->hits; + intercept_sum += prev_intercepts; + /* + * Check if this is the bin with the maximum number of + * intercepts so far and in that case update the index of + * the state with the maximum intercepts metric. 
+ */ + if (prev_intercepts >= intercept_max) { + intercept_max = prev_intercepts; + intercept_max_idx = i - 1; + } if (dev->states_usage[i].disable) continue; @@ -414,9 +436,22 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* - * Look for the deepest idle state whose target residency had - * not exceeded the idle duration in over a half of the relevant - * cases in the past. + * If the minimum state index is greater than or equal to the + * index of the state with the maximum intercepts metric and + * the corresponding state is enabled, there is no need to look + * at the deeper states. + */ + if (min_idx >= intercept_max_idx && + !dev->states_usage[min_idx].disable) { + idx = min_idx; + goto constraint; + } + + /* + * Look for the deepest enabled idle state, at most as deep as + * the one with the maximum intercepts metric, whose target + * residency had not been greater than the idle duration in over + * a half of the relevant cases in the past. * * Take the possible duration limitation present if the tick * has been stopped already into account. @@ -428,7 +463,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; idx = i; - if (2 * intercept_sum > idx_intercept_sum) + if (2 * intercept_sum > idx_intercept_sum && + i <= intercept_max_idx) break; } } -- cgit v1.2.3