diff options
| -rw-r--r-- | MAINTAINERS | 1 | ||||
| -rw-r--r-- | drivers/cpuidle/governors/menu.c | 24 | ||||
| -rw-r--r-- | drivers/cpuidle/governors/teo.c | 98 | ||||
| -rw-r--r-- | drivers/idle/intel_idle.c | 268 |
4 files changed, 317 insertions, 74 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 083ac5e3f3f0..7fe9a770af78 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6561,6 +6561,7 @@ F: rust/kernel/cpu.rs CPU IDLE TIME MANAGEMENT FRAMEWORK M: "Rafael J. Wysocki" <rafael@kernel.org> M: Daniel Lezcano <daniel.lezcano@linaro.org> +R: Christian Loehle <christian.loehle@arm.com> L: linux-pm@vger.kernel.org S: Maintained B: https://bugzilla.kernel.org diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 64d6f7a1c776..c6052055ba0f 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -239,7 +239,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* Find the shortest expected idle interval. */ predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; - if (predicted_ns > RESIDENCY_THRESHOLD_NS) { + if (predicted_ns > RESIDENCY_THRESHOLD_NS || tick_nohz_tick_stopped()) { unsigned int timer_us; /* Determine the time till the closest timer. */ @@ -259,6 +259,16 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, RESOLUTION * DECAY * NSEC_PER_USEC); /* Use the lowest expected idle interval to pick the idle state. */ predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns); + /* + * If the tick is already stopped, the cost of possible short + * idle duration misprediction is much higher, because the CPU + * may be stuck in a shallow idle state for a long time as a + * result of it. In that case, say we might mispredict and use + * the known time till the closest timer event for the idle + * state selection. 
+ */ + if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC) + predicted_ns = data->next_timer_ns; } else { /* * Because the next timer event is not going to be determined @@ -271,7 +281,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, data->bucket = BUCKETS - 1; } - if (unlikely(drv->state_count <= 1 || latency_req == 0) || + if (drv->state_count <= 1 || latency_req == 0 || ((data->next_timer_ns < drv->states[1].target_residency_ns || latency_req < drv->states[1].exit_latency_ns) && !dev->states_usage[0].disable)) { @@ -285,16 +295,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* - * If the tick is already stopped, the cost of possible short idle - * duration misprediction is much higher, because the CPU may be stuck - * in a shallow idle state for a long time as a result of it. In that - * case, say we might mispredict and use the known time till the closest - * timer event for the idle state selection. - */ - if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC) - predicted_ns = data->next_timer_ns; - - /* * Find the idle state with the lowest power while satisfying * our constraints. */ diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 81ac5fd58a1c..80f3ba942a06 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -48,12 +48,11 @@ * in accordance with what happened last time. * * The "hits" metric reflects the relative frequency of situations in which the - * sleep length and the idle duration measured after CPU wakeup fall into the - * same bin (that is, the CPU appears to wake up "on time" relative to the sleep - * length). 
In turn, the "intercepts" metric reflects the relative frequency of - * non-timer wakeup events for which the measured idle duration falls into a bin - * that corresponds to an idle state shallower than the one whose bin is fallen - * into by the sleep length (these events are also referred to as "intercepts" + * sleep length and the idle duration measured after CPU wakeup are close enough + * (that is, the CPU appears to wake up "on time" relative to the sleep length). + * In turn, the "intercepts" metric reflects the relative frequency of non-timer + * wakeup events for which the measured idle duration is significantly different + * from the sleep length (these events are also referred to as "intercepts" * below). * * The governor also counts "intercepts" with the measured idle duration below @@ -75,12 +74,17 @@ * than the candidate one (it represents the cases in which the CPU was * likely woken up by a non-timer wakeup source). * + * Also find the idle state with the maximum intercepts metric (if there are + * multiple states with the maximum intercepts metric, choose the one with + * the highest index). + * * 2. If the second sum computed in step 1 is greater than a half of the sum of * both metrics for the candidate state bin and all subsequent bins (if any), * a shallower idle state is likely to be more suitable, so look for it. * * - Traverse the enabled idle states shallower than the candidate one in the - * descending order. + * descending order, starting at the state with the maximum intercepts + * metric found in step 1. 
* * - For each of them compute the sum of the "intercepts" metrics over all * of the idle states between it and the candidate one (including the @@ -167,6 +171,7 @@ static void teo_decay(unsigned int *metric) */ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { + s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); int i, idx_timer = 0, idx_duration = 0; s64 target_residency_ns, measured_ns; @@ -182,8 +187,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) */ measured_ns = S64_MAX; } else { - s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; - measured_ns = dev->last_residency_ns; /* * The delay between the wakeup and the first instruction @@ -239,15 +242,31 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->state_bins[drv->state_count-1].hits += PULSE; return; } + /* + * If intercepts within the tick period range are not frequent + * enough, count this wakeup as a hit, since it is likely that + * the tick has woken up the CPU because an expected intercept + * was not there. Otherwise, one of the intercepts may have + * been incidentally preceded by the tick wakeup. + */ + if (3 * cpu_data->tick_intercepts < 2 * total) { + cpu_data->state_bins[idx_timer].hits += PULSE; + return; + } } /* - * If the measured idle duration falls into the same bin as the sleep - * length, this is a "hit", so update the "hits" metric for that bin. + * If the measured idle duration (adjusted for the entered state exit + * latency) falls into the same bin as the sleep length and the latter + * is less than the "raw" measured idle duration (so the wakeup appears + * to have occurred after the anticipated timer event), this is a "hit", + * so update the "hits" metric for that bin. + * * Otherwise, update the "intercepts" metric for the bin fallen into by * the measured idle duration. 
*/ - if (idx_timer == idx_duration) { + if (idx_timer == idx_duration && + cpu_data->sleep_length_ns - measured_ns < lat_ns / 2) { cpu_data->state_bins[idx_timer].hits += PULSE; } else { cpu_data->state_bins[idx_duration].intercepts += PULSE; @@ -294,8 +313,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, ktime_t delta_tick = TICK_NSEC / 2; unsigned int idx_intercept_sum = 0; unsigned int intercept_sum = 0; + unsigned int intercept_max = 0; unsigned int idx_hit_sum = 0; unsigned int hit_sum = 0; + int intercept_max_idx = -1; int constraint_idx = 0; int idx0 = 0, idx = -1; s64 duration_ns; @@ -326,17 +347,32 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (!dev->states_usage[0].disable) idx = 0; - /* Compute the sums of metrics for early wakeup pattern detection. */ + /* + * Compute the sums of metrics for early wakeup pattern detection and + * look for the state bin with the maximum intercepts metric below the + * deepest enabled one (if there are multiple states with the maximum + * intercepts metric, choose the one with the highest index). + */ for (i = 1; i < drv->state_count; i++) { struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; + unsigned int prev_intercepts = prev_bin->intercepts; struct cpuidle_state *s = &drv->states[i]; /* * Update the sums of idle state metrics for all of the states * shallower than the current one. */ - intercept_sum += prev_bin->intercepts; hit_sum += prev_bin->hits; + intercept_sum += prev_intercepts; + /* + * Check if this is the bin with the maximum number of + * intercepts so far and in that case update the index of + * the state with the maximum intercepts metric. 
+ */ + if (prev_intercepts >= intercept_max) { + intercept_max = prev_intercepts; + intercept_max_idx = i - 1; + } if (dev->states_usage[i].disable) continue; @@ -388,12 +424,34 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, while (min_idx < idx && drv->states[min_idx].target_residency_ns < TICK_NSEC) min_idx++; + + /* + * Avoid selecting a state with a lower index, but with + * the same target residency as the current candidate + * one. + */ + if (drv->states[min_idx].target_residency_ns == + drv->states[idx].target_residency_ns) + goto constraint; + } + + /* + * If the minimum state index is greater than or equal to the + * index of the state with the maximum intercepts metric and + * the corresponding state is enabled, there is no need to look + * at the deeper states. + */ + if (min_idx >= intercept_max_idx && + !dev->states_usage[min_idx].disable) { + idx = min_idx; + goto constraint; } /* - * Look for the deepest idle state whose target residency had - * not exceeded the idle duration in over a half of the relevant - * cases in the past. + * Look for the deepest enabled idle state, at most as deep as + * the one with the maximum intercepts metric, whose target + * residency had not been greater than the idle duration in over + * a half of the relevant cases in the past. * * Take the possible duration limitation present if the tick * has been stopped already into account. @@ -405,11 +463,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; idx = i; - if (2 * intercept_sum > idx_intercept_sum) + if (2 * intercept_sum > idx_intercept_sum && + i <= intercept_max_idx) break; } } +constraint: /* * If there is a latency constraint, it may be necessary to select an * idle state shallower than the current candidate one. @@ -464,7 +524,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * total wakeup events, do not stop the tick. 
*/ if (drv->states[idx].target_residency_ns < TICK_NSEC && - cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8) + 3 * cpu_data->tick_intercepts >= 2 * cpu_data->total) duration_ns = TICK_NSEC / 2; end: diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 9ba83954c255..f49c939d636f 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -45,6 +45,7 @@ #include <linux/kernel.h> #include <linux/cpuidle.h> #include <linux/tick.h> +#include <linux/time64.h> #include <trace/events/power.h> #include <linux/sched.h> #include <linux/sched/smt.h> @@ -63,8 +64,6 @@ #include <asm/fpu/api.h> #include <asm/smp.h> -#define INTEL_IDLE_VERSION "0.5.1" - static struct cpuidle_driver intel_idle_driver = { .name = "intel_idle", .owner = THIS_MODULE, @@ -72,10 +71,18 @@ static struct cpuidle_driver intel_idle_driver = { /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask __read_mostly; -static unsigned int preferred_states_mask __read_mostly; static bool force_irq_on __read_mostly; static bool ibrs_off __read_mostly; +/* The maximum allowed length for the 'table' module parameter */ +#define MAX_CMDLINE_TABLE_LEN 256 +/* Maximum allowed C-state latency */ +#define MAX_CMDLINE_LATENCY_US (5 * USEC_PER_MSEC) +/* Maximum allowed C-state target residency */ +#define MAX_CMDLINE_RESIDENCY_US (100 * USEC_PER_MSEC) + +static char cmdline_table_str[MAX_CMDLINE_TABLE_LEN] __read_mostly; + static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; @@ -107,6 +114,9 @@ static struct device *sysfs_root __initdata; static const struct idle_cpu *icpu __initdata; static struct cpuidle_state *cpuidle_state_table __initdata; +/* C-states data from the 'intel_idle.table' cmdline parameter */ +static struct cpuidle_state cmdline_states[CPUIDLE_STATE_MAX] __initdata; + static unsigned int mwait_substates 
__initdata; /* @@ -2052,25 +2062,6 @@ static void __init skx_idle_state_table_update(void) } /** - * adl_idle_state_table_update - Adjust AlderLake idle states table. - */ -static void __init adl_idle_state_table_update(void) -{ - /* Check if user prefers C1 over C1E. */ - if (preferred_states_mask & BIT(1) && !(preferred_states_mask & BIT(2))) { - cpuidle_state_table[0].flags &= ~CPUIDLE_FLAG_UNUSABLE; - cpuidle_state_table[1].flags |= CPUIDLE_FLAG_UNUSABLE; - - /* Disable C1E by clearing the "C1E promotion" bit. */ - c1e_promotion = C1E_PROMOTION_DISABLE; - return; - } - - /* Make sure C1E is enabled by default */ - c1e_promotion = C1E_PROMOTION_ENABLE; -} - -/** * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. */ static void __init spr_idle_state_table_update(void) @@ -2176,11 +2167,6 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_EMERALDRAPIDS_X: spr_idle_state_table_update(); break; - case INTEL_ALDERLAKE: - case INTEL_ALDERLAKE_L: - case INTEL_ATOM_GRACEMONT: - adl_idle_state_table_update(); - break; case INTEL_ATOM_SILVERMONT: case INTEL_ATOM_AIRMONT: byt_cht_auto_demotion_disable(); @@ -2420,6 +2406,197 @@ static void __init intel_idle_sysfs_uninit(void) put_device(sysfs_root); } + /** + * get_cmdline_field - Get the current field from a cmdline string. + * @args: The cmdline string to get the current field from. + * @field: Pointer to the current field upon return. + * @sep: The fields separator character. + * + * Examples: + * Input: args="C1:1:1,C1E:2:10", sep=':' + * Output: field="C1", return "1:1,C1E:2:10" + * Input: args="C1:1:1,C1E:2:10", sep=',' + * Output: field="C1:1:1", return "C1E:2:10" + * Input: args="::", sep=':' + * Output: field="", return ":" + * + * Return: The continuation of the cmdline string after the field or NULL. 
+ */ +static char *get_cmdline_field(char *args, char **field, char sep) +{ + unsigned int i; + + for (i = 0; args[i] && !isspace(args[i]); i++) { + if (args[i] == sep) + break; + } + + *field = args; + + if (args[i] != sep) + return NULL; + + args[i] = '\0'; + return args + i + 1; +} + +/** + * validate_cmdline_cstate - Validate a C-state from cmdline. + * @state: The C-state to validate. + * @prev_state: The previous C-state in the table or NULL. + * + * Return: 0 if the C-state is valid or -EINVAL otherwise. + */ +static int validate_cmdline_cstate(struct cpuidle_state *state, + struct cpuidle_state *prev_state) +{ + if (state->exit_latency == 0) + /* Exit latency 0 can only be used for the POLL state */ + return -EINVAL; + + if (state->exit_latency > MAX_CMDLINE_LATENCY_US) + return -EINVAL; + + if (state->target_residency > MAX_CMDLINE_RESIDENCY_US) + return -EINVAL; + + if (state->target_residency < state->exit_latency) + return -EINVAL; + + if (!prev_state) + return 0; + + if (state->exit_latency <= prev_state->exit_latency) + return -EINVAL; + + if (state->target_residency <= prev_state->target_residency) + return -EINVAL; + + return 0; +} + +/** + * cmdline_table_adjust - Adjust the C-states table with data from cmdline. + * @drv: cpuidle driver (assumed to point to intel_idle_driver). + * + * Adjust the C-states table with data from the 'intel_idle.table' module + * parameter (if specified). + */ +static void __init cmdline_table_adjust(struct cpuidle_driver *drv) +{ + char *args = cmdline_table_str; + struct cpuidle_state *state; + int i; + + if (args[0] == '\0') + /* The 'intel_idle.table' module parameter was not specified */ + return; + + /* Create a copy of the C-states table */ + for (i = 0; i < drv->state_count; i++) + cmdline_states[i] = drv->states[i]; + + /* + * Adjust the C-states table copy with data from the 'intel_idle.table' + * module parameter. 
+ */ + while (args) { + char *fields, *name, *val; + + /* + * Get the next C-state definition, which is expected to be + * '<name>:<latency_us>:<target_residency_us>'. Treat "empty" + * fields as unchanged. For example, + * '<name>::<target_residency_us>' leaves the latency unchanged. + */ + args = get_cmdline_field(args, &fields, ','); + + /* name */ + fields = get_cmdline_field(fields, &name, ':'); + if (!fields) + goto error; + + if (!strcmp(name, "POLL")) { + pr_err("Cannot adjust POLL\n"); + continue; + } + + /* Find the C-state by its name */ + state = NULL; + for (i = 0; i < drv->state_count; i++) { + if (!strcmp(name, drv->states[i].name)) { + state = &cmdline_states[i]; + break; + } + } + + if (!state) { + pr_err("C-state '%s' was not found\n", name); + continue; + } + + /* Latency */ + fields = get_cmdline_field(fields, &val, ':'); + if (!fields) + goto error; + + if (*val) { + if (kstrtouint(val, 0, &state->exit_latency)) + goto error; + } + + /* Target residency */ + fields = get_cmdline_field(fields, &val, ':'); + + if (*val) { + if (kstrtouint(val, 0, &state->target_residency)) + goto error; + } + + /* + * Allow for 3 more fields, but ignore them. Helps to make + * possible future extensions of the cmdline format backward + * compatible. 
+ */ + for (i = 0; fields && i < 3; i++) { + fields = get_cmdline_field(fields, &val, ':'); + if (!fields) + break; + } + + if (fields) { + pr_err("Too many fields for C-state '%s'\n", state->name); + goto error; + } + + pr_info("C-state from cmdline: name=%s, latency=%u, residency=%u\n", + state->name, state->exit_latency, state->target_residency); + } + + /* Validate the adjusted C-states, start with index 1 to skip POLL */ + for (i = 1; i < drv->state_count; i++) { + struct cpuidle_state *prev_state; + + state = &cmdline_states[i]; + prev_state = &cmdline_states[i - 1]; + + if (validate_cmdline_cstate(state, prev_state)) { + pr_err("C-state '%s' validation failed\n", state->name); + goto error; + } + } + + /* Copy the adjusted C-states table back */ + for (i = 1; i < drv->state_count; i++) + drv->states[i] = cmdline_states[i]; + + pr_info("Adjusted C-states with data from 'intel_idle.table'\n"); + return; + +error: + pr_info("Failed to adjust C-states with data from 'intel_idle.table'\n"); +} + static int __init intel_idle_init(void) { const struct x86_cpu_id *id; @@ -2478,19 +2655,17 @@ static int __init intel_idle_init(void) return -ENODEV; } - pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n", - boot_cpu_data.x86_model); - intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); if (!intel_idle_cpuidle_devices) return -ENOMEM; + intel_idle_cpuidle_driver_init(&intel_idle_driver); + cmdline_table_adjust(&intel_idle_driver); + retval = intel_idle_sysfs_init(); if (retval) pr_warn("failed to initialized sysfs"); - intel_idle_cpuidle_driver_init(&intel_idle_driver); - retval = cpuidle_register_driver(&intel_idle_driver); if (retval) { struct cpuidle_driver *drv = cpuidle_get_driver(); @@ -2538,17 +2713,6 @@ module_param(max_cstate, int, 0444); module_param_named(states_off, disabled_states_mask, uint, 0444); MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); /* - * Some platforms come with mutually exclusive C-states, so that if one is - * 
enabled, the other C-states must not be used. Example: C1 and C1E on - * Sapphire Rapids platform. This parameter allows for selecting the - * preferred C-states among the groups of mutually exclusive C-states - the - * selected C-states will be registered, the other C-states from the mutually - * exclusive group won't be registered. If the platform has no mutually - * exclusive C-states, this parameter has no effect. - */ -module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); -MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); -/* * Debugging option that forces the driver to enter all C-states with * interrupts enabled. Does not apply to C-states with * 'CPUIDLE_FLAG_INIT_XSTATE' and 'CPUIDLE_FLAG_IBRS' flags. @@ -2560,3 +2724,21 @@ module_param(force_irq_on, bool, 0444); */ module_param(ibrs_off, bool, 0444); MODULE_PARM_DESC(ibrs_off, "Disable IBRS when idle"); + +/* + * Define the C-states table from a user input string. Expected format is + * 'name:latency:residency', where: + * - name: The C-state name. + * - latency: The C-state exit latency in us. + * - residency: The C-state target residency in us. + * + * Multiple C-states can be defined by separating them with commas: + * 'name1:latency1:residency1,name2:latency2:residency2' + * + * Example: intel_idle.table=C1:1:1,C1E:5:10,C6:100:600 + * + * To leave latency or residency unchanged, use an empty field, for example: + * 'C1:1:1,C1E::10' - leaves C1E latency unchanged. + */ +module_param_string(table, cmdline_table_str, MAX_CMDLINE_TABLE_LEN, 0444); +MODULE_PARM_DESC(table, "Build the C-states table from a user input string"); |
