summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS1
-rw-r--r--drivers/cpuidle/governors/menu.c24
-rw-r--r--drivers/cpuidle/governors/teo.c98
-rw-r--r--drivers/idle/intel_idle.c268
4 files changed, 317 insertions, 74 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 083ac5e3f3f0..7fe9a770af78 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6561,6 +6561,7 @@ F: rust/kernel/cpu.rs
CPU IDLE TIME MANAGEMENT FRAMEWORK
M: "Rafael J. Wysocki" <rafael@kernel.org>
M: Daniel Lezcano <daniel.lezcano@linaro.org>
+R: Christian Loehle <christian.loehle@arm.com>
L: linux-pm@vger.kernel.org
S: Maintained
B: https://bugzilla.kernel.org
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 64d6f7a1c776..c6052055ba0f 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -239,7 +239,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
/* Find the shortest expected idle interval. */
predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
- if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
+ if (predicted_ns > RESIDENCY_THRESHOLD_NS || tick_nohz_tick_stopped()) {
unsigned int timer_us;
/* Determine the time till the closest timer. */
@@ -259,6 +259,16 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
RESOLUTION * DECAY * NSEC_PER_USEC);
/* Use the lowest expected idle interval to pick the idle state. */
predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
+ /*
+ * If the tick is already stopped, the cost of possible short
+ * idle duration misprediction is much higher, because the CPU
+ * may be stuck in a shallow idle state for a long time as a
+ * result of it. In that case, say we might mispredict and use
+ * the known time till the closest timer event for the idle
+ * state selection.
+ */
+ if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC)
+ predicted_ns = data->next_timer_ns;
} else {
/*
* Because the next timer event is not going to be determined
@@ -271,7 +281,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
data->bucket = BUCKETS - 1;
}
- if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
+ if (drv->state_count <= 1 || latency_req == 0 ||
((data->next_timer_ns < drv->states[1].target_residency_ns ||
latency_req < drv->states[1].exit_latency_ns) &&
!dev->states_usage[0].disable)) {
@@ -285,16 +295,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
}
/*
- * If the tick is already stopped, the cost of possible short idle
- * duration misprediction is much higher, because the CPU may be stuck
- * in a shallow idle state for a long time as a result of it. In that
- * case, say we might mispredict and use the known time till the closest
- * timer event for the idle state selection.
- */
- if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC)
- predicted_ns = data->next_timer_ns;
-
- /*
* Find the idle state with the lowest power while satisfying
* our constraints.
*/
diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 81ac5fd58a1c..80f3ba942a06 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -48,12 +48,11 @@
* in accordance with what happened last time.
*
* The "hits" metric reflects the relative frequency of situations in which the
- * sleep length and the idle duration measured after CPU wakeup fall into the
- * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
- * length). In turn, the "intercepts" metric reflects the relative frequency of
- * non-timer wakeup events for which the measured idle duration falls into a bin
- * that corresponds to an idle state shallower than the one whose bin is fallen
- * into by the sleep length (these events are also referred to as "intercepts"
+ * sleep length and the idle duration measured after CPU wakeup are close enough
+ * (that is, the CPU appears to wake up "on time" relative to the sleep length).
+ * In turn, the "intercepts" metric reflects the relative frequency of non-timer
+ * wakeup events for which the measured idle duration is significantly different
+ * from the sleep length (these events are also referred to as "intercepts"
* below).
*
* The governor also counts "intercepts" with the measured idle duration below
@@ -75,12 +74,17 @@
* than the candidate one (it represents the cases in which the CPU was
* likely woken up by a non-timer wakeup source).
*
+ * Also find the idle state with the maximum intercepts metric (if there are
+ * multiple states with the maximum intercepts metric, choose the one with
+ * the highest index).
+ *
* 2. If the second sum computed in step 1 is greater than a half of the sum of
* both metrics for the candidate state bin and all subsequent bins (if any),
* a shallower idle state is likely to be more suitable, so look for it.
*
* - Traverse the enabled idle states shallower than the candidate one in the
- * descending order.
+ * descending order, starting at the state with the maximum intercepts
+ * metric found in step 1.
*
* - For each of them compute the sum of the "intercepts" metrics over all
* of the idle states between it and the candidate one (including the
@@ -167,6 +171,7 @@ static void teo_decay(unsigned int *metric)
*/
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
+ s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
int i, idx_timer = 0, idx_duration = 0;
s64 target_residency_ns, measured_ns;
@@ -182,8 +187,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
*/
measured_ns = S64_MAX;
} else {
- s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
-
measured_ns = dev->last_residency_ns;
/*
* The delay between the wakeup and the first instruction
@@ -239,15 +242,31 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
cpu_data->state_bins[drv->state_count-1].hits += PULSE;
return;
}
+ /*
+ * If intercepts within the tick period range are not frequent
+ * enough, count this wakeup as a hit, since it is likely that
+ * the tick has woken up the CPU because an expected intercept
+ * was not there. Otherwise, one of the intercepts may have
+ * been incidentally preceded by the tick wakeup.
+ */
+ if (3 * cpu_data->tick_intercepts < 2 * total) {
+ cpu_data->state_bins[idx_timer].hits += PULSE;
+ return;
+ }
}
/*
- * If the measured idle duration falls into the same bin as the sleep
- * length, this is a "hit", so update the "hits" metric for that bin.
+ * If the measured idle duration (adjusted for the entered state exit
+ * latency) falls into the same bin as the sleep length and the latter
+ * is less than the "raw" measured idle duration (so the wakeup appears
+ * to have occurred after the anticipated timer event), this is a "hit",
+ * so update the "hits" metric for that bin.
+ *
* Otherwise, update the "intercepts" metric for the bin fallen into by
* the measured idle duration.
*/
- if (idx_timer == idx_duration) {
+ if (idx_timer == idx_duration &&
+ cpu_data->sleep_length_ns - measured_ns < lat_ns / 2) {
cpu_data->state_bins[idx_timer].hits += PULSE;
} else {
cpu_data->state_bins[idx_duration].intercepts += PULSE;
@@ -294,8 +313,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
ktime_t delta_tick = TICK_NSEC / 2;
unsigned int idx_intercept_sum = 0;
unsigned int intercept_sum = 0;
+ unsigned int intercept_max = 0;
unsigned int idx_hit_sum = 0;
unsigned int hit_sum = 0;
+ int intercept_max_idx = -1;
int constraint_idx = 0;
int idx0 = 0, idx = -1;
s64 duration_ns;
@@ -326,17 +347,32 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
if (!dev->states_usage[0].disable)
idx = 0;
- /* Compute the sums of metrics for early wakeup pattern detection. */
+ /*
+ * Compute the sums of metrics for early wakeup pattern detection and
+ * look for the state bin with the maximum intercepts metric below the
+ * deepest enabled one (if there are multiple states with the maximum
+ * intercepts metric, choose the one with the highest index).
+ */
for (i = 1; i < drv->state_count; i++) {
struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
+ unsigned int prev_intercepts = prev_bin->intercepts;
struct cpuidle_state *s = &drv->states[i];
/*
* Update the sums of idle state metrics for all of the states
* shallower than the current one.
*/
- intercept_sum += prev_bin->intercepts;
hit_sum += prev_bin->hits;
+ intercept_sum += prev_intercepts;
+ /*
+ * Check if this is the bin with the maximum number of
+ * intercepts so far and in that case update the index of
+ * the state with the maximum intercepts metric.
+ */
+ if (prev_intercepts >= intercept_max) {
+ intercept_max = prev_intercepts;
+ intercept_max_idx = i - 1;
+ }
if (dev->states_usage[i].disable)
continue;
@@ -388,12 +424,34 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
while (min_idx < idx &&
drv->states[min_idx].target_residency_ns < TICK_NSEC)
min_idx++;
+
+ /*
+ * Avoid selecting a state with a lower index, but with
+ * the same target residency as the current candidate
+ * one.
+ */
+ if (drv->states[min_idx].target_residency_ns ==
+ drv->states[idx].target_residency_ns)
+ goto constraint;
+ }
+
+ /*
+ * If the minimum state index is greater than or equal to the
+ * index of the state with the maximum intercepts metric and
+ * the corresponding state is enabled, there is no need to look
+ * at the deeper states.
+ */
+ if (min_idx >= intercept_max_idx &&
+ !dev->states_usage[min_idx].disable) {
+ idx = min_idx;
+ goto constraint;
}
/*
- * Look for the deepest idle state whose target residency had
- * not exceeded the idle duration in over a half of the relevant
- * cases in the past.
+ * Look for the deepest enabled idle state, at most as deep as
+ * the one with the maximum intercepts metric, whose target
+ * residency had not been greater than the idle duration in over
+ * a half of the relevant cases in the past.
*
* Take the possible duration limitation present if the tick
* has been stopped already into account.
@@ -405,11 +463,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
continue;
idx = i;
- if (2 * intercept_sum > idx_intercept_sum)
+ if (2 * intercept_sum > idx_intercept_sum &&
+ i <= intercept_max_idx)
break;
}
}
+constraint:
/*
* If there is a latency constraint, it may be necessary to select an
* idle state shallower than the current candidate one.
@@ -464,7 +524,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
* total wakeup events, do not stop the tick.
*/
if (drv->states[idx].target_residency_ns < TICK_NSEC &&
- cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8)
+ 3 * cpu_data->tick_intercepts >= 2 * cpu_data->total)
duration_ns = TICK_NSEC / 2;
end:
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 9ba83954c255..f49c939d636f 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -45,6 +45,7 @@
#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/tick.h>
+#include <linux/time64.h>
#include <trace/events/power.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
@@ -63,8 +64,6 @@
#include <asm/fpu/api.h>
#include <asm/smp.h>
-#define INTEL_IDLE_VERSION "0.5.1"
-
static struct cpuidle_driver intel_idle_driver = {
.name = "intel_idle",
.owner = THIS_MODULE,
@@ -72,10 +71,18 @@ static struct cpuidle_driver intel_idle_driver = {
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = CPUIDLE_STATE_MAX - 1;
static unsigned int disabled_states_mask __read_mostly;
-static unsigned int preferred_states_mask __read_mostly;
static bool force_irq_on __read_mostly;
static bool ibrs_off __read_mostly;
+/* The maximum allowed length for the 'table' module parameter */
+#define MAX_CMDLINE_TABLE_LEN 256
+/* Maximum allowed C-state latency */
+#define MAX_CMDLINE_LATENCY_US (5 * USEC_PER_MSEC)
+/* Maximum allowed C-state target residency */
+#define MAX_CMDLINE_RESIDENCY_US (100 * USEC_PER_MSEC)
+
+static char cmdline_table_str[MAX_CMDLINE_TABLE_LEN] __read_mostly;
+
static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
static unsigned long auto_demotion_disable_flags;
@@ -107,6 +114,9 @@ static struct device *sysfs_root __initdata;
static const struct idle_cpu *icpu __initdata;
static struct cpuidle_state *cpuidle_state_table __initdata;
+/* C-states data from the 'intel_idle.table' cmdline parameter */
+static struct cpuidle_state cmdline_states[CPUIDLE_STATE_MAX] __initdata;
+
static unsigned int mwait_substates __initdata;
/*
@@ -2052,25 +2062,6 @@ static void __init skx_idle_state_table_update(void)
}
/**
- * adl_idle_state_table_update - Adjust AlderLake idle states table.
- */
-static void __init adl_idle_state_table_update(void)
-{
- /* Check if user prefers C1 over C1E. */
- if (preferred_states_mask & BIT(1) && !(preferred_states_mask & BIT(2))) {
- cpuidle_state_table[0].flags &= ~CPUIDLE_FLAG_UNUSABLE;
- cpuidle_state_table[1].flags |= CPUIDLE_FLAG_UNUSABLE;
-
- /* Disable C1E by clearing the "C1E promotion" bit. */
- c1e_promotion = C1E_PROMOTION_DISABLE;
- return;
- }
-
- /* Make sure C1E is enabled by default */
- c1e_promotion = C1E_PROMOTION_ENABLE;
-}
-
-/**
* spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
*/
static void __init spr_idle_state_table_update(void)
@@ -2176,11 +2167,6 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
case INTEL_EMERALDRAPIDS_X:
spr_idle_state_table_update();
break;
- case INTEL_ALDERLAKE:
- case INTEL_ALDERLAKE_L:
- case INTEL_ATOM_GRACEMONT:
- adl_idle_state_table_update();
- break;
case INTEL_ATOM_SILVERMONT:
case INTEL_ATOM_AIRMONT:
byt_cht_auto_demotion_disable();
@@ -2420,6 +2406,197 @@ static void __init intel_idle_sysfs_uninit(void)
put_device(sysfs_root);
}
+/**
+ * get_cmdline_field - Get the current field from a cmdline string.
+ * @args: The cmdline string to get the current field from.
+ * @field: Pointer to the current field upon return.
+ * @sep: The fields separator character.
+ *
+ * Examples:
+ * Input: args="C1:1:1,C1E:2:10", sep=':'
+ * Output: field="C1", return "1:1,C1E:2:10"
+ * Input: args="C1:1:1,C1E:2:10", sep=','
+ * Output: field="C1:1:1", return "C1E:2:10"
+ * Input: args="::", sep=':'
+ * Output: field="", return ":"
+ *
+ * Return: The continuation of the cmdline string after the field or NULL.
+ */
+static char *get_cmdline_field(char *args, char **field, char sep)
+{
+ unsigned int i;
+
+ for (i = 0; args[i] && !isspace(args[i]); i++) {
+ if (args[i] == sep)
+ break;
+ }
+
+ *field = args;
+
+ if (args[i] != sep)
+ return NULL;
+
+ args[i] = '\0';
+ return args + i + 1;
+}
+
+/**
+ * validate_cmdline_cstate - Validate a C-state from cmdline.
+ * @state: The C-state to validate.
+ * @prev_state: The previous C-state in the table or NULL.
+ *
+ * Return: 0 if the C-state is valid or -EINVAL otherwise.
+ */
+static int validate_cmdline_cstate(struct cpuidle_state *state,
+ struct cpuidle_state *prev_state)
+{
+ if (state->exit_latency == 0)
+ /* Exit latency 0 can only be used for the POLL state */
+ return -EINVAL;
+
+ if (state->exit_latency > MAX_CMDLINE_LATENCY_US)
+ return -EINVAL;
+
+ if (state->target_residency > MAX_CMDLINE_RESIDENCY_US)
+ return -EINVAL;
+
+ if (state->target_residency < state->exit_latency)
+ return -EINVAL;
+
+ if (!prev_state)
+ return 0;
+
+ if (state->exit_latency <= prev_state->exit_latency)
+ return -EINVAL;
+
+ if (state->target_residency <= prev_state->target_residency)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * cmdline_table_adjust - Adjust the C-states table with data from cmdline.
+ * @drv: cpuidle driver (assumed to point to intel_idle_driver).
+ *
+ * Adjust the C-states table with data from the 'intel_idle.table' module
+ * parameter (if specified).
+ */
+static void __init cmdline_table_adjust(struct cpuidle_driver *drv)
+{
+ char *args = cmdline_table_str;
+ struct cpuidle_state *state;
+ int i;
+
+ if (args[0] == '\0')
+ /* The 'intel_idle.table' module parameter was not specified */
+ return;
+
+ /* Create a copy of the C-states table */
+ for (i = 0; i < drv->state_count; i++)
+ cmdline_states[i] = drv->states[i];
+
+ /*
+ * Adjust the C-states table copy with data from the 'intel_idle.table'
+ * module parameter.
+ */
+ while (args) {
+ char *fields, *name, *val;
+
+ /*
+ * Get the next C-state definition, which is expected to be
+ * '<name>:<latency_us>:<target_residency_us>'. Treat "empty"
+ * fields as unchanged. For example,
+ * '<name>::<target_residency_us>' leaves the latency unchanged.
+ */
+ args = get_cmdline_field(args, &fields, ',');
+
+ /* name */
+ fields = get_cmdline_field(fields, &name, ':');
+ if (!fields)
+ goto error;
+
+ if (!strcmp(name, "POLL")) {
+ pr_err("Cannot adjust POLL\n");
+ continue;
+ }
+
+ /* Find the C-state by its name */
+ state = NULL;
+ for (i = 0; i < drv->state_count; i++) {
+ if (!strcmp(name, drv->states[i].name)) {
+ state = &cmdline_states[i];
+ break;
+ }
+ }
+
+ if (!state) {
+ pr_err("C-state '%s' was not found\n", name);
+ continue;
+ }
+
+ /* Latency */
+ fields = get_cmdline_field(fields, &val, ':');
+ if (!fields)
+ goto error;
+
+ if (*val) {
+ if (kstrtouint(val, 0, &state->exit_latency))
+ goto error;
+ }
+
+ /* Target residency */
+ fields = get_cmdline_field(fields, &val, ':');
+
+ if (*val) {
+ if (kstrtouint(val, 0, &state->target_residency))
+ goto error;
+ }
+
+ /*
+ * Allow for 3 more fields, but ignore them. Helps to make
+ * possible future extensions of the cmdline format backward
+ * compatible.
+ */
+ for (i = 0; fields && i < 3; i++) {
+ fields = get_cmdline_field(fields, &val, ':');
+ if (!fields)
+ break;
+ }
+
+ if (fields) {
+ pr_err("Too many fields for C-state '%s'\n", state->name);
+ goto error;
+ }
+
+ pr_info("C-state from cmdline: name=%s, latency=%u, residency=%u\n",
+ state->name, state->exit_latency, state->target_residency);
+ }
+
+ /* Validate the adjusted C-states, start with index 1 to skip POLL */
+ for (i = 1; i < drv->state_count; i++) {
+ struct cpuidle_state *prev_state;
+
+ state = &cmdline_states[i];
+ prev_state = &cmdline_states[i - 1];
+
+ if (validate_cmdline_cstate(state, prev_state)) {
+ pr_err("C-state '%s' validation failed\n", state->name);
+ goto error;
+ }
+ }
+
+ /* Copy the adjusted C-states table back */
+ for (i = 1; i < drv->state_count; i++)
+ drv->states[i] = cmdline_states[i];
+
+ pr_info("Adjusted C-states with data from 'intel_idle.table'\n");
+ return;
+
+error:
+ pr_info("Failed to adjust C-states with data from 'intel_idle.table'\n");
+}
+
static int __init intel_idle_init(void)
{
const struct x86_cpu_id *id;
@@ -2478,19 +2655,17 @@ static int __init intel_idle_init(void)
return -ENODEV;
}
- pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
- boot_cpu_data.x86_model);
-
intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
if (!intel_idle_cpuidle_devices)
return -ENOMEM;
+ intel_idle_cpuidle_driver_init(&intel_idle_driver);
+ cmdline_table_adjust(&intel_idle_driver);
+
retval = intel_idle_sysfs_init();
if (retval)
pr_warn("failed to initialized sysfs");
- intel_idle_cpuidle_driver_init(&intel_idle_driver);
-
retval = cpuidle_register_driver(&intel_idle_driver);
if (retval) {
struct cpuidle_driver *drv = cpuidle_get_driver();
@@ -2538,17 +2713,6 @@ module_param(max_cstate, int, 0444);
module_param_named(states_off, disabled_states_mask, uint, 0444);
MODULE_PARM_DESC(states_off, "Mask of disabled idle states");
/*
- * Some platforms come with mutually exclusive C-states, so that if one is
- * enabled, the other C-states must not be used. Example: C1 and C1E on
- * Sapphire Rapids platform. This parameter allows for selecting the
- * preferred C-states among the groups of mutually exclusive C-states - the
- * selected C-states will be registered, the other C-states from the mutually
- * exclusive group won't be registered. If the platform has no mutually
- * exclusive C-states, this parameter has no effect.
- */
-module_param_named(preferred_cstates, preferred_states_mask, uint, 0444);
-MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states");
-/*
* Debugging option that forces the driver to enter all C-states with
* interrupts enabled. Does not apply to C-states with
* 'CPUIDLE_FLAG_INIT_XSTATE' and 'CPUIDLE_FLAG_IBRS' flags.
@@ -2560,3 +2724,21 @@ module_param(force_irq_on, bool, 0444);
*/
module_param(ibrs_off, bool, 0444);
MODULE_PARM_DESC(ibrs_off, "Disable IBRS when idle");
+
+/*
+ * Define the C-states table from a user input string. Expected format is
+ * 'name:latency:residency', where:
+ * - name: The C-state name.
+ * - latency: The C-state exit latency in us.
+ * - residency: The C-state target residency in us.
+ *
+ * Multiple C-states can be defined by separating them with commas:
+ * 'name1:latency1:residency1,name2:latency2:residency2'
+ *
+ * Example: intel_idle.table=C1:1:1,C1E:5:10,C6:100:600
+ *
+ * To leave latency or residency unchanged, use an empty field, for example:
+ * 'C1:1:1,C1E::10' - leaves C1E latency unchanged.
+ */
+module_param_string(table, cmdline_table_str, MAX_CMDLINE_TABLE_LEN, 0444);
+MODULE_PARM_DESC(table, "Build the C-states table from a user input string");