From e560083c0467f86b72aecac377b27bd1e7d16c49 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Fri, 30 Jan 2026 12:49:40 +0530 Subject: OPP: debugfs: Use performance level if available to distinguish between rates Some OPP tables have entries with same rate and different performance level. For these entries, using only the rate as the debugfs directory name causes below error: debugfs: 'opp:5000000' already exists in 'soc@0-1c00000.pci' Fix it by appending the performance level to the dir name if available. Reported-by: Bjorn Andersson Closes: https://lore.kernel.org/linux-arm-msm/75lzykd37zdvrks5i2bb4zb2yzjtm25kv3hegmikndkbr772mz@w2ykff3ny45u/ Fixes: 05db35963eef ("OPP: Add support to find OPP for a set of keys") Signed-off-by: Manivannan Sadhasivam Signed-off-by: Viresh Kumar --- drivers/opp/debugfs.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c index 8fc6238b1728..61506d30d5ff 100644 --- a/drivers/opp/debugfs.c +++ b/drivers/opp/debugfs.c @@ -130,22 +130,24 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) { struct dentry *pdentry = opp_table->dentry; struct dentry *d; - unsigned long id; - char name[25]; /* 20 chars for 64 bit value + 5 (opp:\0) */ + char name[36]; /* "opp:"(4) + u64(20) + "-" (1) + u32(10) + NULL(1) */ /* * Get directory name for OPP. * - * - Normally rate is unique to each OPP, use it to get unique opp-name. + * - Normally rate is unique to each OPP, use it to get unique opp-name, + * together with performance level if available. * - For some devices rate isn't available or there are multiple, use * index instead for them. */ - if (likely(opp_table->clk_count == 1 && opp->rates[0])) - id = opp->rates[0]; - else - id = _get_opp_count(opp_table); - - snprintf(name, sizeof(name), "opp:%lu", id); + if (likely(opp_table->clk_count == 1 && opp->rates[0])) { + if (opp->level == OPP_LEVEL_UNSET) + snprintf(name, sizeof(name), "opp:%lu", opp->rates[0]); + else + snprintf(name, sizeof(name), "opp:%lu-%u", opp->rates[0], opp->level); + } else { + snprintf(name, sizeof(name), "opp:%u", _get_opp_count(opp_table)); + } /* Create per-opp directory */ d = debugfs_create_dir(name, pdentry); -- cgit v1.2.3 From 3d2398f44a2d48fb1c575a6e0bc6b38f3e689e22 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 23 Feb 2026 11:05:59 +0530 Subject: OPP: Move break out of scoped_guard in dev_pm_opp_xlate_required_opp() The commit ff9c512041f2 ("OPP: Use mutex locking guards") unintentionally made the for loop run longer than required. scoped_guard() is implemented as a for loop. The break statement now breaks out out the scoped_guard() and not out of the outer for loop. The outer loop always iterates to completion. Fix it. Fixes: ff9c512041f2 ("OPP: Use mutex locking guards") Reported-by: David Lechner Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 866641666e41..da3f5eba4341 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -2742,8 +2742,8 @@ struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, break; } } - break; } + break; } if (IS_ERR(dest_opp)) { -- cgit v1.2.3 From a42622e30f89fee66c1d55594348fe4df1e21669 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Mon, 23 Feb 2026 10:50:18 +0200 Subject: dt-bindings: cpufreq: qcom-hw: document Eliza cpufreq hardware Document the cpufreq hardware on the Eliza SoC. Signed-off-by: Abel Vesa Signed-off-by: Viresh Kumar --- Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml index 22eeaef14f55..98eb36bff172 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml @@ -35,6 +35,7 @@ properties: - description: v2 of CPUFREQ HW (EPSS) items: - enum: + - qcom,eliza-cpufreq-epss - qcom,milos-cpufreq-epss - qcom,qcs8300-cpufreq-epss - qcom,qdu1000-cpufreq-epss -- cgit v1.2.3 From a3e93d8733490a72abb4e27e01d88f5bdc741271 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 26 Feb 2026 16:12:11 +0100 Subject: dt-bindings: arm: nvidia: Document the Tegra238 CCPLEX cluster Tegra238 is derived from Tegra234 and uses a similar CCPLEX cluster, with slight variations but the same programming model. Add a compatible string to specify this particular implementation. Signed-off-by: Thierry Reding Acked-by: Conor Dooley Signed-off-by: Viresh Kumar --- .../devicetree/bindings/arm/tegra/nvidia,tegra-ccplex-cluster.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra-ccplex-cluster.yaml b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra-ccplex-cluster.yaml index 36dbd0838f2d..fe9c8791f227 100644 --- a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra-ccplex-cluster.yaml +++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra-ccplex-cluster.yaml @@ -24,6 +24,7 @@ properties: enum: - nvidia,tegra186-ccplex-cluster - nvidia,tegra234-ccplex-cluster + - nvidia,tegra238-ccplex-cluster reg: maxItems: 1 -- cgit v1.2.3 From 2d14bf98e6b445afbb799b170448f628f6d4ab69 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 26 Feb 2026 16:12:12 +0100 Subject: cpufreq: tegra194: Rename Tegra239 to Tegra238 This chip identifies as Tegra238, so update the device tree compatible string and data structures associated with it to use the correct name. Signed-off-by: Thierry Reding Signed-off-by: Viresh Kumar --- drivers/cpufreq/tegra194-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c index 7a41cfc71a46..c6375e14d445 100644 --- a/drivers/cpufreq/tegra194-cpufreq.c +++ b/drivers/cpufreq/tegra194-cpufreq.c @@ -196,7 +196,7 @@ static const struct tegra_cpufreq_soc tegra234_cpufreq_soc = { .refclk_delta_min = 16000, }; -static const struct tegra_cpufreq_soc tegra239_cpufreq_soc = { +static const struct tegra_cpufreq_soc tegra238_cpufreq_soc = { .ops = &tegra234_cpufreq_ops, .actmon_cntr_base = 0x4000, .maxcpus_per_cluster = 8, @@ -807,7 +807,7 @@ static void tegra194_cpufreq_remove(struct platform_device *pdev) static const struct of_device_id tegra194_cpufreq_of_match[] = { { .compatible = "nvidia,tegra194-ccplex", .data = &tegra194_cpufreq_soc }, { .compatible = "nvidia,tegra234-ccplex-cluster", .data = &tegra234_cpufreq_soc }, - { .compatible = "nvidia,tegra239-ccplex-cluster", .data = &tegra239_cpufreq_soc }, + { .compatible = "nvidia,tegra238-ccplex-cluster", .data = &tegra238_cpufreq_soc }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, tegra194_cpufreq_of_match); -- cgit v1.2.3 From e57c2bf2e89df3b176ab579abfd3ed54fd27034c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:38:55 +0100 Subject: cpuidle: governors: menu: Refine stopped tick handling This change is based on the observation that it is not in fact necessary to select a deep idle state every time the scheduler tick has been stopped before the idle state selection takes place. Namely, if the time till the closest timer (that is not the tick) is short enough, a shallow idle state can be selected because the timer will kick the CPU out of that state, so the damage from a possible overly optimistic selection will be limited. Update the menu governor in accordance with the above and use twice the tick period length as the "safe timer range" for allowing the original predicted_ns value to be used even if the tick has been stopped. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/3341782.5fSG56mABF@rafael.j.wysocki --- drivers/cpuidle/governors/gov.h | 5 +++++ drivers/cpuidle/governors/menu.c | 15 +++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/governors/gov.h b/drivers/cpuidle/governors/gov.h index 99e067d9668c..cd06a2e7b506 100644 --- a/drivers/cpuidle/governors/gov.h +++ b/drivers/cpuidle/governors/gov.h @@ -10,5 +10,10 @@ * check the time till the closest expected timer event. */ #define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC) +/* + * If the closest timer is in this range, the governor idle state selection need + * not be adjusted after the scheduler tick has been stopped. + */ +#define SAFE_TIMER_RANGE_NS (2 * TICK_NSEC) #endif /* __CPUIDLE_GOVERNOR_H */ diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 899ff16ff1fe..544a5d593007 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -261,13 +261,16 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns); /* * If the tick is already stopped, the cost of possible short - * idle duration misprediction is much higher, because the CPU - * may be stuck in a shallow idle state for a long time as a - * result of it. In that case, say we might mispredict and use - * the known time till the closest timer event for the idle - * state selection. + * idle duration misprediction is higher because the CPU may get + * stuck in a shallow idle state then. To avoid that, if + * predicted_ns is small enough, say it might be mispredicted + * and use the known time till the closest timer for idle state + * selection unless that timer is going to trigger within + * SAFE_TIMER_RANGE_NS in which case it can be regarded as a + * sufficient safety net. */ - if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC) + if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC && + data->next_timer_ns > SAFE_TIMER_RANGE_NS) predicted_ns = data->next_timer_ns; } else { /* -- cgit v1.2.3 From 106a2662e655363db0dd73d9a91e1ec9afe9f4b1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:40:18 +0100 Subject: cpuidle: governors: teo: Rearrange stopped tick handling This change is based on the observation that it is not in fact necessary to select a deep idle state every time the scheduler tick has been stopped before the idle state selection takes place. Namely, if the time till the closest timer (that is not the tick) is short enough, a shallow idle state can be selected because the timer will kick the CPU out of that state, so the damage from a possible overly optimistic selection will be limited. Update the teo governor in accordance with the above in analogy with the previous analogous menu governor update. Among other things, this will cause the teo governor to call tick_nohz_get_sleep_length() every time when the tick has been stopped already and only change the original idle state selection if the time till the closest timer is beyond SAFE_TIMER_RANGE_NS which is way more straightforward than the current code flow. Of course, this effectively throws away some of the recent teo governor changes made recently, but the resulting simplification is worth it in my view. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/1865078.VLH7GnMWUR@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 81 +++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 47 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index bec0142377b8..ac43b9b013b3 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -407,50 +407,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * better choice. */ if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { - int min_idx = idx0; - - if (tick_nohz_tick_stopped()) { - /* - * Look for the shallowest idle state below the current - * candidate one whose target residency is at least - * equal to the tick period length. - */ - while (min_idx < idx && - drv->states[min_idx].target_residency_ns < TICK_NSEC) - min_idx++; - - /* - * Avoid selecting a state with a lower index, but with - * the same target residency as the current candidate - * one. - */ - if (drv->states[min_idx].target_residency_ns == - drv->states[idx].target_residency_ns) - goto constraint; - } - - /* - * If the minimum state index is greater than or equal to the - * index of the state with the maximum intercepts metric and - * the corresponding state is enabled, there is no need to look - * at the deeper states. - */ - if (min_idx >= intercept_max_idx && - !dev->states_usage[min_idx].disable) { - idx = min_idx; - goto constraint; - } - /* * Look for the deepest enabled idle state, at most as deep as * the one with the maximum intercepts metric, whose target * residency had not been greater than the idle duration in over * a half of the relevant cases in the past. - * - * Take the possible duration limitation present if the tick - * has been stopped already into account. */ - for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) { + for (i = idx - 1, intercept_sum = 0; i >= idx0; i--) { intercept_sum += cpu_data->state_bins[i].intercepts; if (dev->states_usage[i].disable) @@ -463,7 +426,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } } -constraint: /* * If there is a latency constraint, it may be necessary to select an * idle state shallower than the current candidate one. @@ -472,13 +434,13 @@ constraint: idx = constraint_idx; /* - * If either the candidate state is state 0 or its target residency is - * low enough, there is basically nothing more to do, but if the sleep - * length is not updated, the subsequent wakeup will be counted as an - * "intercept" which may be problematic in the cases when timer wakeups - * are dominant. Namely, it may effectively prevent deeper idle states - * from being selected at one point even if no imminent timers are - * scheduled. + * If the tick has not been stopped and either the candidate state is + * state 0 or its target residency is low enough, there is basically + * nothing more to do, but if the sleep length is not updated, the + * subsequent wakeup will be counted as an "intercept". That may be + * problematic in the cases when timer wakeups are dominant because it + * may effectively prevent deeper idle states from being selected at one + * point even if no imminent timers are scheduled. * * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one * CPU are unlikely (user space has a default 50 us slack value for @@ -494,7 +456,8 @@ constraint: * shallow idle states regardless of the wakeup type, so the sleep * length need not be known in that case. */ - if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && + if (!tick_nohz_tick_stopped() && (!idx || + drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && (2 * cpu_data->short_idles >= cpu_data->total || latency_req < LATENCY_THRESHOLD_NS)) goto out_tick; @@ -502,6 +465,30 @@ constraint: duration_ns = tick_nohz_get_sleep_length(&delta_tick); cpu_data->sleep_length_ns = duration_ns; + /* + * If the tick has been stopped and the closest timer is too far away, + * update the selection to prevent the CPU from getting stuck in a + * shallow idle state for too long. + */ + if (tick_nohz_tick_stopped() && duration_ns > SAFE_TIMER_RANGE_NS && + drv->states[idx].target_residency_ns < TICK_NSEC) { + /* + * Look for the deepest enabled idle state with exit latency + * within the PM QoS limit and with target residency within + * duration_ns. + */ + for (i = constraint_idx; i > idx; i--) { + if (dev->states_usage[i].disable) + continue; + + if (drv->states[i].target_residency_ns <= duration_ns) { + idx = i; + break; + } + } + return idx; + } + if (!idx) goto out_tick; -- cgit v1.2.3 From 54de61a3f6894556364e2164602c7eaa943581be Mon Sep 17 00:00:00 2001 From: Pengjie Zhang Date: Thu, 29 Jan 2026 20:18:13 +0800 Subject: cpufreq: Add debug print for current frequency in __cpufreq_driver_target() Include policy->cur in the debug message to explicitly show the frequency transition (from current to target). Signed-off-by: Pengjie Zhang Acked-by: Viresh Kumar Link: https://patch.msgid.link/20260129121813.3874516-1-zhangpengjie2@huawei.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 277884d91913..2082a9e4384f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2367,8 +2367,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, target_freq = __resolve_freq(policy, target_freq, policy->min, policy->max, relation); - pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n", - policy->cpu, target_freq, relation, old_target_freq); + pr_debug("CPU %u: cur %u kHz -> target %u kHz (req %u kHz, rel %u)\n", + policy->cpu, policy->cur, target_freq, old_target_freq, relation); /* * This might look like a redundant call as we are checking it again -- cgit v1.2.3 From 13060743a441700f6498942d2d0fea4fef0e9d65 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:33 -0800 Subject: powercap: intel_rapl: Add a symbol namespace for intel_rapl exports Cleanup of the intel_rapl common driver requires introducing additional exported helper and lifecycle functions. Before adding new exports, create a dedicated symbol namespace for intel_rapl and update the relevant interface drivers to explicitly import it. This makes the intended usage of these symbols explicit, avoids polluting the global namespace, and prepares the codebase for the ongoing RAPL refactoring. No functional changes are intended. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 20 ++++++++++---------- drivers/powercap/intel_rapl_msr.c | 1 + drivers/powercap/intel_rapl_tpmi.c | 1 + .../intel/int340x_thermal/processor_thermal_rapl.c | 1 + 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 019a65a5283a..f05a72d7d340 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -2083,7 +2083,7 @@ int rapl_package_add_pmu_locked(struct rapl_package *rp) return rapl_pmu_update(rp); } -EXPORT_SYMBOL_GPL(rapl_package_add_pmu_locked); +EXPORT_SYMBOL_NS_GPL(rapl_package_add_pmu_locked, "INTEL_RAPL"); int rapl_package_add_pmu(struct rapl_package *rp) { @@ -2091,7 +2091,7 @@ int rapl_package_add_pmu(struct rapl_package *rp) return rapl_package_add_pmu_locked(rp); } -EXPORT_SYMBOL_GPL(rapl_package_add_pmu); +EXPORT_SYMBOL_NS_GPL(rapl_package_add_pmu, "INTEL_RAPL"); void rapl_package_remove_pmu_locked(struct rapl_package *rp) { @@ -2109,7 +2109,7 @@ void rapl_package_remove_pmu_locked(struct rapl_package *rp) perf_pmu_unregister(&rapl_pmu.pmu); memset(&rapl_pmu, 0, sizeof(struct rapl_pmu)); } -EXPORT_SYMBOL_GPL(rapl_package_remove_pmu_locked); +EXPORT_SYMBOL_NS_GPL(rapl_package_remove_pmu_locked, "INTEL_RAPL"); void rapl_package_remove_pmu(struct rapl_package *rp) { @@ -2117,7 +2117,7 @@ void rapl_package_remove_pmu(struct rapl_package *rp) rapl_package_remove_pmu_locked(rp); } -EXPORT_SYMBOL_GPL(rapl_package_remove_pmu); +EXPORT_SYMBOL_NS_GPL(rapl_package_remove_pmu, "INTEL_RAPL"); #endif /* called from CPU hotplug notifier, hotplug lock held */ @@ -2150,14 +2150,14 @@ void rapl_remove_package_cpuslocked(struct rapl_package *rp) list_del(&rp->plist); kfree(rp); } -EXPORT_SYMBOL_GPL(rapl_remove_package_cpuslocked); +EXPORT_SYMBOL_NS_GPL(rapl_remove_package_cpuslocked, "INTEL_RAPL"); void rapl_remove_package(struct rapl_package *rp) { guard(cpus_read_lock)(); rapl_remove_package_cpuslocked(rp); } -EXPORT_SYMBOL_GPL(rapl_remove_package); +EXPORT_SYMBOL_NS_GPL(rapl_remove_package, "INTEL_RAPL"); /* * RAPL Package energy counter scope: @@ -2200,14 +2200,14 @@ struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_ return NULL; } -EXPORT_SYMBOL_GPL(rapl_find_package_domain_cpuslocked); +EXPORT_SYMBOL_NS_GPL(rapl_find_package_domain_cpuslocked, "INTEL_RAPL"); struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu) { guard(cpus_read_lock)(); return rapl_find_package_domain_cpuslocked(id, priv, id_is_cpu); } -EXPORT_SYMBOL_GPL(rapl_find_package_domain); +EXPORT_SYMBOL_NS_GPL(rapl_find_package_domain, "INTEL_RAPL"); /* called from CPU hotplug notifier, hotplug lock held */ struct rapl_package *rapl_add_package_cpuslocked(int id, struct rapl_if_priv *priv, bool id_is_cpu) @@ -2261,14 +2261,14 @@ err_free_package: kfree(rp); return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(rapl_add_package_cpuslocked); +EXPORT_SYMBOL_NS_GPL(rapl_add_package_cpuslocked, "INTEL_RAPL"); struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu) { guard(cpus_read_lock)(); return rapl_add_package_cpuslocked(id, priv, id_is_cpu); } -EXPORT_SYMBOL_GPL(rapl_add_package); +EXPORT_SYMBOL_NS_GPL(rapl_add_package, "INTEL_RAPL"); static void power_limit_state_save(void) { diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 3d5e7f56d68a..9cd6a241df51 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -263,3 +263,4 @@ module_platform_driver(intel_rapl_msr_driver); MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit) control via MSR interface"); MODULE_AUTHOR("Zhang Rui "); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS("INTEL_RAPL"); diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index ba956a2571d1..1fa821376dde 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -348,6 +348,7 @@ static struct auxiliary_driver intel_rapl_tpmi_driver = { module_auxiliary_driver(intel_rapl_tpmi_driver) +MODULE_IMPORT_NS("INTEL_RAPL"); MODULE_IMPORT_NS("INTEL_TPMI"); MODULE_DESCRIPTION("Intel RAPL TPMI Driver"); diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c index bf51a17c5be6..e56b18aeda71 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c @@ -111,4 +111,5 @@ void proc_thermal_rapl_remove(void) EXPORT_SYMBOL_GPL(proc_thermal_rapl_remove); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS("INTEL_RAPL"); MODULE_DESCRIPTION("RAPL interface using MMIO"); -- cgit v1.2.3 From 4fee5b749c92db83a60032206708ed5f68098e05 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:34 -0800 Subject: powercap: intel_rapl: Cleanup coding style Improve code readability and consistency by: - Aligning macro definitions vertically - Reformatting primitive info arrays with consistent indentation - Aligning CPU ID table entries - Reorganizing macro definitions for better logical grouping - Using consistent hex formatting (0x00 instead of 0) - Capitalizing hex digits consistently (0xDF instead of 0xdf) - Removing unnecessary parentheses around numeric constants No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-3-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 446 ++++++++++++++++++----------------- 1 file changed, 227 insertions(+), 219 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index f05a72d7d340..207825246724 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -31,70 +31,95 @@ #include /* bitmasks for RAPL MSRs, used by primitive access functions */ -#define ENERGY_STATUS_MASK 0xffffffff +#define ENERGY_STATUS_MASK 0xffffffff -#define POWER_LIMIT1_MASK 0x7FFF -#define POWER_LIMIT1_ENABLE BIT(15) -#define POWER_LIMIT1_CLAMP BIT(16) +#define POWER_LIMIT1_MASK 0x7FFF +#define POWER_LIMIT1_ENABLE BIT(15) +#define POWER_LIMIT1_CLAMP BIT(16) -#define POWER_LIMIT2_MASK (0x7FFFULL<<32) -#define POWER_LIMIT2_ENABLE BIT_ULL(47) -#define POWER_LIMIT2_CLAMP BIT_ULL(48) -#define POWER_HIGH_LOCK BIT_ULL(63) -#define POWER_LOW_LOCK BIT(31) +#define POWER_LIMIT2_MASK (0x7FFFULL<<32) +#define POWER_LIMIT2_ENABLE BIT_ULL(47) +#define POWER_LIMIT2_CLAMP BIT_ULL(48) +#define POWER_HIGH_LOCK BIT_ULL(63) +#define POWER_LOW_LOCK BIT(31) #define POWER_LIMIT4_MASK 0x1FFF -#define TIME_WINDOW1_MASK (0x7FULL<<17) -#define TIME_WINDOW2_MASK (0x7FULL<<49) +#define TIME_WINDOW1_MASK (0x7FULL<<17) +#define TIME_WINDOW2_MASK (0x7FULL<<49) -#define POWER_UNIT_OFFSET 0 -#define POWER_UNIT_MASK 0x0F +#define POWER_UNIT_OFFSET 0x00 +#define POWER_UNIT_MASK 0x0F -#define ENERGY_UNIT_OFFSET 0x08 -#define ENERGY_UNIT_MASK 0x1F00 +#define ENERGY_UNIT_OFFSET 0x08 +#define ENERGY_UNIT_MASK 0x1F00 -#define TIME_UNIT_OFFSET 0x10 -#define TIME_UNIT_MASK 0xF0000 +#define TIME_UNIT_OFFSET 0x10 +#define TIME_UNIT_MASK 0xF0000 -#define POWER_INFO_MAX_MASK (0x7fffULL<<32) -#define POWER_INFO_MIN_MASK (0x7fffULL<<16) -#define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48) -#define POWER_INFO_THERMAL_SPEC_MASK 0x7fff +#define POWER_INFO_MAX_MASK (0x7fffULL<<32) +#define POWER_INFO_MIN_MASK (0x7fffULL<<16) +#define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48) +#define POWER_INFO_THERMAL_SPEC_MASK 0x7fff -#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff -#define PP_POLICY_MASK 0x1F +#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff +#define PP_POLICY_MASK 0x1F /* * SPR has different layout for Psys Domain PowerLimit registers. * There are 17 bits of PL1 and PL2 instead of 15 bits. * The Enable bits and TimeWindow bits are also shifted as a result. */ -#define PSYS_POWER_LIMIT1_MASK 0x1FFFF -#define PSYS_POWER_LIMIT1_ENABLE BIT(17) +#define PSYS_POWER_LIMIT1_MASK 0x1FFFF +#define PSYS_POWER_LIMIT1_ENABLE BIT(17) -#define PSYS_POWER_LIMIT2_MASK (0x1FFFFULL<<32) -#define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49) +#define PSYS_POWER_LIMIT2_MASK (0x1FFFFULL<<32) +#define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49) -#define PSYS_TIME_WINDOW1_MASK (0x7FULL<<19) -#define PSYS_TIME_WINDOW2_MASK (0x7FULL<<51) +#define PSYS_TIME_WINDOW1_MASK (0x7FULL<<19) +#define PSYS_TIME_WINDOW2_MASK (0x7FULL<<51) /* bitmasks for RAPL TPMI, used by primitive access functions */ -#define TPMI_POWER_LIMIT_MASK 0x3FFFF -#define TPMI_POWER_LIMIT_ENABLE BIT_ULL(62) -#define TPMI_TIME_WINDOW_MASK (0x7FULL<<18) -#define TPMI_INFO_SPEC_MASK 0x3FFFF -#define TPMI_INFO_MIN_MASK (0x3FFFFULL << 18) -#define TPMI_INFO_MAX_MASK (0x3FFFFULL << 36) +#define TPMI_POWER_LIMIT_MASK 0x3FFFF +#define TPMI_POWER_LIMIT_ENABLE BIT_ULL(62) +#define TPMI_TIME_WINDOW_MASK (0x7FULL<<18) +#define TPMI_INFO_SPEC_MASK 0x3FFFF +#define TPMI_INFO_MIN_MASK (0x3FFFFULL << 18) +#define TPMI_INFO_MAX_MASK (0x3FFFFULL << 36) #define TPMI_INFO_MAX_TIME_WIN_MASK (0x7FULL << 54) /* Non HW constants */ -#define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ -#define RAPL_PRIMITIVE_DUMMY BIT(2) +#define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ +#define RAPL_PRIMITIVE_DUMMY BIT(2) + +#define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ + +/* per domain data, some are optional */ +#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2) + +#define DOMAIN_STATE_INACTIVE BIT(0) +#define DOMAIN_STATE_POWER_LIMIT_SET BIT(1) + +/* Sideband MBI registers */ +#define IOSF_CPU_POWER_BUDGET_CTL_BYT 0x02 +#define IOSF_CPU_POWER_BUDGET_CTL_TNG 0xDF + +#define PACKAGE_PLN_INT_SAVED BIT(0) +#define MAX_PRIM_NAME 32 + +/* TPMI Unit register has different layout */ +#define TPMI_POWER_UNIT_OFFSET POWER_UNIT_OFFSET +#define TPMI_POWER_UNIT_MASK POWER_UNIT_MASK +#define TPMI_ENERGY_UNIT_OFFSET 0x06 +#define TPMI_ENERGY_UNIT_MASK 0x7C0 +#define TPMI_TIME_UNIT_OFFSET 0x0C +#define TPMI_TIME_UNIT_MASK 0xF000 + +#define RAPL_EVENT_MASK GENMASK(7, 0) + +#define TIME_WINDOW_MAX_MSEC 40000 +#define TIME_WINDOW_MIN_MSEC 250 -#define TIME_WINDOW_MAX_MSEC 40000 -#define TIME_WINDOW_MIN_MSEC 250 -#define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ enum unit_type { ARBITRARY_UNIT, /* no translation */ POWER_UNIT, @@ -102,12 +127,6 @@ enum unit_type { TIME_UNIT, }; -/* per domain data, some are optional */ -#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2) - -#define DOMAIN_STATE_INACTIVE BIT(0) -#define DOMAIN_STATE_POWER_LIMIT_SET BIT(1) - static const char *pl_names[NR_POWER_LIMITS] = { [POWER_LIMIT1] = "long_term", [POWER_LIMIT2] = "short_term", @@ -222,13 +241,6 @@ static struct rapl_defaults *get_defaults(struct rapl_package *rp) return rp->priv->defaults; } -/* Sideband MBI registers */ -#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2) -#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf) - -#define PACKAGE_PLN_INT_SAVED BIT(0) -#define MAX_PRIM_NAME (32) - /* per domain data. used to describe individual knobs such that access function * can be consolidated into one instead of many inline functions. */ @@ -659,99 +671,104 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, /* RAPL primitives for MSR and MMIO I/F */ static struct rapl_primitive_info rpi_msr[NR_RAPL_PRIMITIVES] = { /* name, mask, shift, msr index, unit divisor */ - [POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0, - RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32, - RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0, - RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), - [ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, - RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), - [FW_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [FW_HIGH_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_HIGH_LOCK, 63, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL1_CLAMP] = PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL2_CLAMP] = PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17, - RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - [TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49, - RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - [THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK, - 0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48, - RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), - [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0, - RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), - [PRIORITY_LEVEL] = PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, - RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), - [PSYS_POWER_LIMIT1] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0, - RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [PSYS_POWER_LIMIT2] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32, - RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [PSYS_PL1_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PSYS_PL2_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PSYS_TIME_WINDOW1] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19, - RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - [PSYS_TIME_WINDOW2] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51, - RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0, + RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), + [ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, + RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), + [FW_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [FW_HIGH_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_HIGH_LOCK, 63, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL1_CLAMP] = PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_CLAMP] = PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, + POWER_INFO_THERMAL_SPEC_MASK, 0, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, + POWER_INFO_MAX_TIME_WIN_MASK, 48, + RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), + [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, + PERF_STATUS_THROTTLE_TIME_MASK, 0, + RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), + [PRIORITY_LEVEL] = PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, + RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), + [PSYS_POWER_LIMIT1] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [PSYS_POWER_LIMIT2] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, + 32, RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [PSYS_PL1_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, + 17, RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, + 0), + [PSYS_PL2_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, + 49, RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, + 0), + [PSYS_TIME_WINDOW1] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, + 19, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [PSYS_TIME_WINDOW2] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, + 51, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), /* non-hardware */ - [AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, - RAPL_PRIMITIVE_DERIVED), + [AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, + RAPL_PRIMITIVE_DERIVED), }; /* RAPL primitives for TPMI I/F */ static struct rapl_primitive_info rpi_tpmi[NR_RAPL_PRIMITIVES] = { /* name, mask, shift, msr index, unit divisor */ - [POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, TPMI_POWER_LIMIT_MASK, 0, - RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, TPMI_POWER_LIMIT_MASK, 0, - RAPL_DOMAIN_REG_PL2, POWER_UNIT, 0), - [POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, TPMI_POWER_LIMIT_MASK, 0, - RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), - [ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, - RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), - [PL1_LOCK] = PRIMITIVE_INFO_INIT(PL1_LOCK, POWER_HIGH_LOCK, 63, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL2_LOCK] = PRIMITIVE_INFO_INIT(PL2_LOCK, POWER_HIGH_LOCK, 63, - RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0), - [PL4_LOCK] = PRIMITIVE_INFO_INIT(PL4_LOCK, POWER_HIGH_LOCK, 63, - RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), - [PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, - RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0), - [PL4_ENABLE] = PRIMITIVE_INFO_INIT(PL4_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, - RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), - [TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TPMI_TIME_WINDOW_MASK, 18, - RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - [TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TPMI_TIME_WINDOW_MASK, 18, - RAPL_DOMAIN_REG_PL2, TIME_UNIT, 0), - [THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, TPMI_INFO_SPEC_MASK, 0, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, TPMI_INFO_MAX_MASK, 36, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, TPMI_INFO_MIN_MASK, 18, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, TPMI_INFO_MAX_TIME_WIN_MASK, 54, - RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), - [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0, - RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), + [POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, TPMI_POWER_LIMIT_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, TPMI_POWER_LIMIT_MASK, 0, + RAPL_DOMAIN_REG_PL2, POWER_UNIT, 0), + [POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, TPMI_POWER_LIMIT_MASK, 0, + RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), + [ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, + RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), + [PL1_LOCK] = PRIMITIVE_INFO_INIT(PL1_LOCK, POWER_HIGH_LOCK, 63, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_LOCK] = PRIMITIVE_INFO_INIT(PL2_LOCK, POWER_HIGH_LOCK, 63, + RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0), + [PL4_LOCK] = PRIMITIVE_INFO_INIT(PL4_LOCK, POWER_HIGH_LOCK, 63, + RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), + [PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, + RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0), + [PL4_ENABLE] = PRIMITIVE_INFO_INIT(PL4_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, + RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), + [TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TPMI_TIME_WINDOW_MASK, 18, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TPMI_TIME_WINDOW_MASK, 18, + RAPL_DOMAIN_REG_PL2, TIME_UNIT, 0), + [THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, TPMI_INFO_SPEC_MASK, 0, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, TPMI_INFO_MAX_MASK, 36, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, TPMI_INFO_MIN_MASK, 18, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, TPMI_INFO_MAX_TIME_WIN_MASK, + 54, RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), + [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, + 0, RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), /* non-hardware */ - [AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, - POWER_UNIT, RAPL_PRIMITIVE_DERIVED), + [AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, + RAPL_PRIMITIVE_DERIVED), }; static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim) @@ -1143,14 +1160,6 @@ static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, return value; } -/* TPMI Unit register has different layout */ -#define TPMI_POWER_UNIT_OFFSET POWER_UNIT_OFFSET -#define TPMI_POWER_UNIT_MASK POWER_UNIT_MASK -#define TPMI_ENERGY_UNIT_OFFSET 0x06 -#define TPMI_ENERGY_UNIT_MASK 0x7C0 -#define TPMI_TIME_UNIT_OFFSET 0x0C -#define TPMI_TIME_UNIT_MASK 0xF000 - static int rapl_check_unit_tpmi(struct rapl_domain *rd) { struct reg_action ra; @@ -1241,77 +1250,77 @@ static const struct rapl_defaults rapl_defaults_amd = { }; static const struct x86_cpu_id rapl_ids[] __initconst = { - X86_MATCH_VFM(INTEL_SANDYBRIDGE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &rapl_defaults_core), - - X86_MATCH_VFM(INTEL_IVYBRIDGE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &rapl_defaults_core), - - X86_MATCH_VFM(INTEL_HASWELL, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_HASWELL_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_HASWELL_G, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_HASWELL_X, &rapl_defaults_hsw_server), - - X86_MATCH_VFM(INTEL_BROADWELL, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_BROADWELL_G, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_BROADWELL_D, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_BROADWELL_X, &rapl_defaults_hsw_server), - - X86_MATCH_VFM(INTEL_SKYLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_SKYLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_SKYLAKE_X, &rapl_defaults_hsw_server), - X86_MATCH_VFM(INTEL_KABYLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_KABYLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_CANNONLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE_X, &rapl_defaults_hsw_server), - X86_MATCH_VFM(INTEL_ICELAKE_D, &rapl_defaults_hsw_server), - X86_MATCH_VFM(INTEL_COMETLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_COMETLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_TIGERLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ROCKETLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_BARTLETTLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), - X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), - X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_LAKEFIELD, &rapl_defaults_core), - - X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht), - X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng), - X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2,&rapl_defaults_ann), - X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_TREMONT, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &rapl_defaults_core), - - X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &rapl_defaults_hsw_server), - X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &rapl_defaults_hsw_server), - - X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd), - X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd), - X86_MATCH_VENDOR_FAM(AMD, 0x1A, &rapl_defaults_amd), - X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd), + X86_MATCH_VFM(INTEL_SANDYBRIDGE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_IVYBRIDGE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_HASWELL, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_HASWELL_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_HASWELL_G, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_HASWELL_X, &rapl_defaults_hsw_server), + + X86_MATCH_VFM(INTEL_BROADWELL, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BROADWELL_G, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BROADWELL_D, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BROADWELL_X, &rapl_defaults_hsw_server), + + X86_MATCH_VFM(INTEL_SKYLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SKYLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SKYLAKE_X, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_KABYLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_KABYLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_CANNONLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_X, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_ICELAKE_D, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_COMETLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_COMETLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_TIGERLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ROCKETLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BARTLETTLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), + X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_LAKEFIELD, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &rapl_defaults_ann), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_TREMONT, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &rapl_defaults_hsw_server), + + X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd), + X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd), + X86_MATCH_VENDOR_FAM(AMD, 0x1A, &rapl_defaults_amd), + X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd), {} }; MODULE_DEVICE_TABLE(x86cpu, rapl_ids); @@ -1777,7 +1786,6 @@ enum perf_rapl_events { PERF_RAPL_PSYS, /* psys */ PERF_RAPL_MAX }; -#define RAPL_EVENT_MASK GENMASK(7, 0) static const int event_to_domain[PERF_RAPL_MAX] = { [PERF_RAPL_PP0] = RAPL_DOMAIN_PP0, -- cgit v1.2.3 From c64e89ba9fa3d603142a4c5746cf148dc569574e Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:35 -0800 Subject: powercap: intel_rapl: Remove unused TIME_WINDOW macros Remove TIME_WINDOW_MIN_MSEC and TIME_WINDOW_MAX_MSEC as they are not used anywhere in the code. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-4-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 207825246724..3bdd21d20468 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -117,9 +117,6 @@ #define RAPL_EVENT_MASK GENMASK(7, 0) -#define TIME_WINDOW_MAX_MSEC 40000 -#define TIME_WINDOW_MIN_MSEC 250 - enum unit_type { ARBITRARY_UNIT, /* no translation */ POWER_UNIT, -- cgit v1.2.3 From 637bf7404e046ab72f7019a13bbaa9a0368d2d1d Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:36 -0800 Subject: powercap: intel_rapl: Simplify rapl_compute_time_window_atom() Restructure to use early return for to_raw case, eliminating redundant assignments and clarifying conversion paths. No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-5-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 3bdd21d20468..2a77a0ee239b 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1145,16 +1145,14 @@ static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, bool to_raw) { + if (to_raw) + return div64_u64(value, rd->time_unit); + /* * Atom time unit encoding is straight forward val * time_unit, * where time_unit is default to 1 sec. Never 0. */ - if (!to_raw) - return (value) ? value * rd->time_unit : rd->time_unit; - - value = div64_u64(value, rd->time_unit); - - return value; + return (value) ? value * rd->time_unit : rd->time_unit; } static int rapl_check_unit_tpmi(struct rapl_domain *rd) -- cgit v1.2.3 From 923860a899a5e343fa0313c9f2f32d67c254f1ac Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:37 -0800 Subject: powercap: intel_rapl: Use shifts for power-of-2 operations Replace division by (1 << value) with shift operations for clarity and consistency. Add ULL suffix to avoid undefined behavior from shifting signed integers. No functional changes intended. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-6-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 2a77a0ee239b..2c30d791e443 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -964,13 +964,13 @@ static int rapl_check_unit_core(struct rapl_domain *rd) } value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; - rd->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value); + rd->energy_unit = (ENERGY_UNIT_SCALE * 1000000) >> value; value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rd->power_unit = 1000000 / (1 << value); + rd->power_unit = 1000000 >> value; value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rd->time_unit = 1000000 / (1 << value); + rd->time_unit = 1000000 >> value; pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -992,13 +992,13 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) } value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; - rd->energy_unit = ENERGY_UNIT_SCALE * 1 << value; + rd->energy_unit = ENERGY_UNIT_SCALE * (1ULL << value); value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rd->power_unit = (1 << value) * 1000; + rd->power_unit = (1ULL << value) * 1000; value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rd->time_unit = 1000000 / (1 << value); + rd->time_unit = 1000000 >> value; pr_debug("Atom %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -1121,7 +1121,7 @@ static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, if (!to_raw) { f = (value & 0x60) >> 5; y = value & 0x1f; - value = (1 << y) * (4 + f) * rd->time_unit / 4; + value = (1ULL << y) * (4 + f) * rd->time_unit / 4; } else { if (value < rd->time_unit) return 0; @@ -1169,13 +1169,13 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) } value = (ra.value & TPMI_ENERGY_UNIT_MASK) >> TPMI_ENERGY_UNIT_OFFSET; - rd->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value); + rd->energy_unit = (ENERGY_UNIT_SCALE * 1000000) >> value; value = (ra.value & TPMI_POWER_UNIT_MASK) >> TPMI_POWER_UNIT_OFFSET; - rd->power_unit = 1000000 / (1 << value); + rd->power_unit = 1000000 >> value; value = (ra.value & TPMI_TIME_UNIT_MASK) >> TPMI_TIME_UNIT_OFFSET; - rd->time_unit = 1000000 / (1 << value); + rd->time_unit = 1000000 >> value; pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); -- cgit v1.2.3 From 71bb2c50e0248217dd3999621b06bd69c31e6e90 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:38 -0800 Subject: powercap: intel_rapl: Use GENMASK() and BIT() macros Replace hardcoded bitmasks and bit shift operations with standard GENMASK(), GENMASK_ULL(), BIT(), and BIT_ULL() macros for better readability and to follow kernel coding conventions. No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-7-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 60 ++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 2c30d791e443..8dde3b0eb454 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -31,62 +31,62 @@ #include /* bitmasks for RAPL MSRs, used by primitive access functions */ -#define ENERGY_STATUS_MASK 0xffffffff +#define ENERGY_STATUS_MASK GENMASK(31, 0) -#define POWER_LIMIT1_MASK 0x7FFF +#define POWER_LIMIT1_MASK GENMASK(14, 0) #define POWER_LIMIT1_ENABLE BIT(15) #define POWER_LIMIT1_CLAMP BIT(16) -#define POWER_LIMIT2_MASK (0x7FFFULL<<32) +#define POWER_LIMIT2_MASK GENMASK_ULL(46, 32) #define POWER_LIMIT2_ENABLE BIT_ULL(47) #define POWER_LIMIT2_CLAMP BIT_ULL(48) #define POWER_HIGH_LOCK BIT_ULL(63) #define POWER_LOW_LOCK BIT(31) -#define POWER_LIMIT4_MASK 0x1FFF +#define POWER_LIMIT4_MASK GENMASK(12, 0) -#define TIME_WINDOW1_MASK (0x7FULL<<17) -#define TIME_WINDOW2_MASK (0x7FULL<<49) +#define TIME_WINDOW1_MASK GENMASK_ULL(23, 17) +#define TIME_WINDOW2_MASK GENMASK_ULL(55, 49) #define POWER_UNIT_OFFSET 0x00 -#define POWER_UNIT_MASK 0x0F +#define POWER_UNIT_MASK GENMASK(3, 0) #define ENERGY_UNIT_OFFSET 0x08 -#define ENERGY_UNIT_MASK 0x1F00 +#define ENERGY_UNIT_MASK GENMASK(12, 8) #define TIME_UNIT_OFFSET 0x10 -#define TIME_UNIT_MASK 0xF0000 +#define TIME_UNIT_MASK GENMASK(19, 16) -#define POWER_INFO_MAX_MASK (0x7fffULL<<32) -#define POWER_INFO_MIN_MASK (0x7fffULL<<16) -#define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48) -#define POWER_INFO_THERMAL_SPEC_MASK 0x7fff +#define POWER_INFO_MAX_MASK GENMASK_ULL(46, 32) +#define POWER_INFO_MIN_MASK GENMASK_ULL(30, 16) +#define POWER_INFO_MAX_TIME_WIN_MASK GENMASK_ULL(53, 48) +#define POWER_INFO_THERMAL_SPEC_MASK GENMASK(14, 0) -#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff -#define PP_POLICY_MASK 0x1F +#define PERF_STATUS_THROTTLE_TIME_MASK GENMASK(31, 0) +#define PP_POLICY_MASK GENMASK(4, 0) /* * SPR has different layout for Psys Domain PowerLimit registers. * There are 17 bits of PL1 and PL2 instead of 15 bits. * The Enable bits and TimeWindow bits are also shifted as a result. */ -#define PSYS_POWER_LIMIT1_MASK 0x1FFFF +#define PSYS_POWER_LIMIT1_MASK GENMASK_ULL(16, 0) #define PSYS_POWER_LIMIT1_ENABLE BIT(17) -#define PSYS_POWER_LIMIT2_MASK (0x1FFFFULL<<32) +#define PSYS_POWER_LIMIT2_MASK GENMASK_ULL(48, 32) #define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49) -#define PSYS_TIME_WINDOW1_MASK (0x7FULL<<19) -#define PSYS_TIME_WINDOW2_MASK (0x7FULL<<51) +#define PSYS_TIME_WINDOW1_MASK GENMASK_ULL(25, 19) +#define PSYS_TIME_WINDOW2_MASK GENMASK_ULL(57, 51) /* bitmasks for RAPL TPMI, used by primitive access functions */ -#define TPMI_POWER_LIMIT_MASK 0x3FFFF +#define TPMI_POWER_LIMIT_MASK GENMASK_ULL(17, 0) #define TPMI_POWER_LIMIT_ENABLE BIT_ULL(62) -#define TPMI_TIME_WINDOW_MASK (0x7FULL<<18) -#define TPMI_INFO_SPEC_MASK 0x3FFFF -#define TPMI_INFO_MIN_MASK (0x3FFFFULL << 18) -#define TPMI_INFO_MAX_MASK (0x3FFFFULL << 36) -#define TPMI_INFO_MAX_TIME_WIN_MASK (0x7FULL << 54) +#define TPMI_TIME_WINDOW_MASK GENMASK_ULL(24, 18) +#define TPMI_INFO_SPEC_MASK GENMASK_ULL(17, 0) +#define TPMI_INFO_MIN_MASK GENMASK_ULL(35, 18) +#define TPMI_INFO_MAX_MASK GENMASK_ULL(53, 36) +#define TPMI_INFO_MAX_TIME_WIN_MASK GENMASK_ULL(60, 54) /* Non HW constants */ #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ @@ -111,9 +111,9 @@ #define TPMI_POWER_UNIT_OFFSET POWER_UNIT_OFFSET #define TPMI_POWER_UNIT_MASK POWER_UNIT_MASK #define TPMI_ENERGY_UNIT_OFFSET 0x06 -#define TPMI_ENERGY_UNIT_MASK 0x7C0 +#define TPMI_ENERGY_UNIT_MASK GENMASK_ULL(10, 6) #define TPMI_TIME_UNIT_OFFSET 0x0C -#define TPMI_TIME_UNIT_MASK 0xF000 +#define TPMI_TIME_UNIT_MASK GENMASK_ULL(15, 12) #define RAPL_EVENT_MASK GENMASK(7, 0) @@ -1102,8 +1102,8 @@ static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) &power_ctrl_orig_val); mdata = power_ctrl_orig_val; if (enable) { - mdata &= ~(0x7f << 8); - mdata |= 1 << 8; + mdata &= ~GENMASK(14, 8); + mdata |= BIT(8); } iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE, defaults->floor_freq_reg_addr, mdata); @@ -1136,7 +1136,7 @@ static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, if (y > 0x1f) return 0x7f; - f = div64_u64(4 * (value - (1ULL << y)), 1ULL << y); + f = div64_u64(4 * (value - BIT_ULL(y)), BIT_ULL(y)); value = (y & 0x1f) | ((f & 0x3) << 5); } return value; -- cgit v1.2.3 From 90503f9ffee927c3abdc94a4862d13ae71ea9442 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:39 -0800 Subject: powercap: intel_rapl: Use unit conversion macros from units.h Replace hardcoded numeric constants with standard unit conversion macros from linux/units.h for better code clarity and self-documentation. Add MICROJOULE_PER_JOULE and NANOJOULE_PER_JOULE to units.h to support energy unit conversions, following the existing pattern for power units. No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 19 ++++++++++--------- include/linux/units.h | 3 +++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 8dde3b0eb454..380893baf987 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -964,13 +965,13 @@ static int rapl_check_unit_core(struct rapl_domain *rd) } value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; - rd->energy_unit = (ENERGY_UNIT_SCALE * 1000000) >> value; + rd->energy_unit = (ENERGY_UNIT_SCALE * MICROJOULE_PER_JOULE) >> value; value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rd->power_unit = 1000000 >> value; + rd->power_unit = MICROWATT_PER_WATT >> value; value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rd->time_unit = 1000000 >> value; + rd->time_unit = USEC_PER_SEC >> value; pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -995,10 +996,10 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) rd->energy_unit = ENERGY_UNIT_SCALE * (1ULL << value); value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rd->power_unit = (1ULL << value) * 1000; + rd->power_unit = (1ULL << value) * MILLIWATT_PER_WATT; value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rd->time_unit = 1000000 >> value; + rd->time_unit = USEC_PER_SEC >> value; pr_debug("Atom %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -1169,13 +1170,13 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) } value = (ra.value & TPMI_ENERGY_UNIT_MASK) >> TPMI_ENERGY_UNIT_OFFSET; - rd->energy_unit = (ENERGY_UNIT_SCALE * 1000000) >> value; + rd->energy_unit = (ENERGY_UNIT_SCALE * MICROJOULE_PER_JOULE) >> value; value = (ra.value & TPMI_POWER_UNIT_MASK) >> TPMI_POWER_UNIT_OFFSET; - rd->power_unit = 1000000 >> value; + rd->power_unit = MICROWATT_PER_WATT >> value; value = (ra.value & TPMI_TIME_UNIT_MASK) >> TPMI_TIME_UNIT_OFFSET; - rd->time_unit = 1000000 >> value; + rd->time_unit = USEC_PER_SEC >> value; pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -1208,7 +1209,7 @@ static const struct rapl_defaults rapl_defaults_spr_server = { .check_unit = rapl_check_unit_core, .set_floor_freq = set_floor_freq_default, .compute_time_window = rapl_compute_time_window_core, - .psys_domain_energy_unit = 1000000000, + .psys_domain_energy_unit = NANOJOULE_PER_JOULE, .spr_psys_bits = true, }; diff --git a/include/linux/units.h b/include/linux/units.h index 80d57c50b9e3..c6d78988613a 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -57,6 +57,9 @@ #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL +#define MICROJOULE_PER_JOULE 1000000UL +#define NANOJOULE_PER_JOULE 1000000000UL + #define BYTES_PER_KBIT (KILO / BITS_PER_BYTE) #define BYTES_PER_MBIT (MEGA / BITS_PER_BYTE) #define BYTES_PER_GBIT (GIGA / BITS_PER_BYTE) -- cgit v1.2.3 From d7ca7d1488cc916dbf0a6a594abbda81d4eaeee9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:40 -0800 Subject: powercap: intel_rapl: Allow interface drivers to configure rapl_defaults RAPL default settings vary across different RAPL interfaces (MSR, TPMI, MMIO). Currently, these defaults are stored in the common RAPL driver, which requires interface-specific handling logic and makes the common layer unnecessarily complex. There is no strong reason for the common code to own these defaults, since they are inherently interface-specific. To prepare for moving default configuration into the individual interface drivers, 1. Move struct rapl_defaults into a shared header so that interface drivers can directly populate their own default settings. 2. Change the @defaults field in struct rapl_if_priv from void * to const struct rapl_defaults * to improve type safety and readability and update the common driver to use the typed defaults structure. 3. Update all internal getter functions and local pointers to use const struct rapl_defaults * to maintain const-correctness. 4. Rename and export the common helper functions (check_unit, set_floor_freq, compute_time_window) so interface drivers may reuse or override them as appropriate. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL default settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-9-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 64 ++++++++++++++++-------------------- include/linux/intel_rapl.h | 17 ++++++++-- 2 files changed, 43 insertions(+), 38 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 380893baf987..7c95eb658c16 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -221,20 +221,10 @@ static int get_pl_prim(struct rapl_domain *rd, int pl, enum pl_prims prim) #define power_zone_to_rapl_domain(_zone) \ container_of(_zone, struct rapl_domain, power_zone) -struct rapl_defaults { - u8 floor_freq_reg_addr; - int (*check_unit)(struct rapl_domain *rd); - void (*set_floor_freq)(struct rapl_domain *rd, bool mode); - u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, - bool to_raw); - unsigned int dram_domain_energy_unit; - unsigned int psys_domain_energy_unit; - bool spr_psys_bits; -}; -static struct rapl_defaults *defaults_msr; +static const struct rapl_defaults *defaults_msr; static const struct rapl_defaults defaults_tpmi; -static struct rapl_defaults *get_defaults(struct rapl_package *rp) +static const struct rapl_defaults *get_defaults(struct rapl_package *rp) { return rp->priv->defaults; } @@ -351,7 +341,7 @@ static int find_nr_power_limit(struct rapl_domain *rd) static int set_domain_enable(struct powercap_zone *power_zone, bool mode) { struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u64 val; int ret; @@ -640,7 +630,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, u64 value, int to_raw) { u64 units = 1; - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u64 scale = 1; switch (type) { @@ -785,11 +775,11 @@ static int rapl_config(struct rapl_package *rp) /* MMIO I/F shares the same register layout as MSR registers */ case RAPL_IF_MMIO: case RAPL_IF_MSR: - rp->priv->defaults = (void *)defaults_msr; + rp->priv->defaults = defaults_msr; rp->priv->rpi = (void *)rpi_msr; break; case RAPL_IF_TPMI: - rp->priv->defaults = (void *)&defaults_tpmi; + rp->priv->defaults = &defaults_tpmi; rp->priv->rpi = (void *)rpi_tpmi; break; default: @@ -806,7 +796,7 @@ static int rapl_config(struct rapl_package *rp) static enum rapl_primitives prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) { - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); if (!defaults->spr_psys_bits) return prim; @@ -951,7 +941,7 @@ static int rapl_write_pl_data(struct rapl_domain *rd, int pl, * power unit : microWatts : Represented in milliWatts by default * time unit : microseconds: Represented in seconds by default */ -static int rapl_check_unit_core(struct rapl_domain *rd) +int rapl_default_check_unit(struct rapl_domain *rd) { struct reg_action ra; u32 value; @@ -978,6 +968,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) return 0; } +EXPORT_SYMBOL_NS_GPL(rapl_default_check_unit, "INTEL_RAPL"); static int rapl_check_unit_atom(struct rapl_domain *rd) { @@ -1071,7 +1062,7 @@ static void package_power_limit_irq_restore(struct rapl_package *rp) wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } -static void set_floor_freq_default(struct rapl_domain *rd, bool mode) +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode) { int i; @@ -1085,11 +1076,12 @@ static void set_floor_freq_default(struct rapl_domain *rd, bool mode) rapl_write_pl_data(rd, i, PL_CLAMP, mode); } } +EXPORT_SYMBOL_NS_GPL(rapl_default_set_floor_freq, "INTEL_RAPL"); static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) { static u32 power_ctrl_orig_val; - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u32 mdata; if (!defaults->floor_freq_reg_addr) { @@ -1110,8 +1102,7 @@ static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) defaults->floor_freq_reg_addr, mdata); } -static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, - bool to_raw) +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw) { u64 f, y; /* fraction and exp. used for time unit */ @@ -1142,6 +1133,7 @@ static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, } return value; } +EXPORT_SYMBOL_NS_GPL(rapl_default_compute_time_window, "INTEL_RAPL"); static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, bool to_raw) @@ -1187,28 +1179,28 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) static const struct rapl_defaults defaults_tpmi = { .check_unit = rapl_check_unit_tpmi, /* Reuse existing logic, ignore the PL_CLAMP failures and enable all Power Limits */ - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, }; static const struct rapl_defaults rapl_defaults_core = { .floor_freq_reg_addr = 0, - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, }; static const struct rapl_defaults rapl_defaults_hsw_server = { - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, .dram_domain_energy_unit = 15300, }; static const struct rapl_defaults rapl_defaults_spr_server = { - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, .psys_domain_energy_unit = NANOJOULE_PER_JOULE, .spr_psys_bits = true, }; @@ -1242,7 +1234,7 @@ static const struct rapl_defaults rapl_defaults_cht = { }; static const struct rapl_defaults rapl_defaults_amd = { - .check_unit = rapl_check_unit_core, + .check_unit = rapl_default_check_unit, }; static const struct x86_cpu_id rapl_ids[] __initconst = { @@ -1448,7 +1440,7 @@ static int rapl_check_domain(int domain, struct rapl_package *rp) */ static int rapl_get_domain_unit(struct rapl_domain *rd) { - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); int ret; if (!rd->regs[RAPL_DOMAIN_REG_UNIT].val) { @@ -2341,7 +2333,7 @@ static int __init rapl_init(void) id = x86_match_cpu(rapl_ids); if (id) { - defaults_msr = (struct rapl_defaults *)id->driver_data; + defaults_msr = (const struct rapl_defaults *)id->driver_data; rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0); if (!rapl_msr_platdev) diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index fa1f328d6712..6d694099a3ad 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -128,6 +128,16 @@ struct reg_action { int err; }; +struct rapl_defaults { + u8 floor_freq_reg_addr; + int (*check_unit)(struct rapl_domain *rd); + void (*set_floor_freq)(struct rapl_domain *rd, bool mode); + u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, bool to_raw); + unsigned int dram_domain_energy_unit; + unsigned int psys_domain_energy_unit; + bool spr_psys_bits; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -142,7 +152,7 @@ struct reg_action { * registers. * @write_raw: Callback for writing RAPL interface specific * registers. - * @defaults: internal pointer to interface default settings + * @defaults: pointer to default settings * @rpi: internal pointer to interface primitive info */ struct rapl_if_priv { @@ -154,7 +164,7 @@ struct rapl_if_priv { int limits[RAPL_DOMAIN_MAX]; int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); - void *defaults; + const struct rapl_defaults *defaults; void *rpi; }; @@ -211,6 +221,9 @@ void rapl_remove_package_cpuslocked(struct rapl_package *rp); struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu); struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); +int rapl_default_check_unit(struct rapl_domain *rd); +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode); +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw); #ifdef CONFIG_PERF_EVENTS int rapl_package_add_pmu(struct rapl_package *rp); -- cgit v1.2.3 From 22e729c5e5e0c5a35ae9fbc691bb7635dae3378b Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:41 -0800 Subject: powercap: intel_rapl: Move TPMI default settings into TPMI interface driver TPMI-specific RAPL defaults differ from those used by MSR and MMIO interfaces. Keeping them in RAPL common driver introduces unnecessary complexity. Move the TPMI defaults into the TPMI interface driver. This change includes the following updates: 1. Add a TPMI-local struct rapl_defaults instance and assign it to priv->defaults during TPMI probe. 2. Move rapl_check_unit_tpmi() and related unit-field definitions from the common driver into the TPMI driver. 3. In rapl_check_unit_tpmi(), replace the generic get_rid() usage with direct access to the TPMI package ID, since the function is now interface-specific. No functional changes are intended. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-10-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 45 ---------------------------------- drivers/powercap/intel_rapl_tpmi.c | 47 ++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 7c95eb658c16..6a2153039f73 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -108,14 +108,6 @@ #define PACKAGE_PLN_INT_SAVED BIT(0) #define MAX_PRIM_NAME 32 -/* TPMI Unit register has different layout */ -#define TPMI_POWER_UNIT_OFFSET POWER_UNIT_OFFSET -#define TPMI_POWER_UNIT_MASK POWER_UNIT_MASK -#define TPMI_ENERGY_UNIT_OFFSET 0x06 -#define TPMI_ENERGY_UNIT_MASK GENMASK_ULL(10, 6) -#define TPMI_TIME_UNIT_OFFSET 0x0C -#define TPMI_TIME_UNIT_MASK GENMASK_ULL(15, 12) - #define RAPL_EVENT_MASK GENMASK(7, 0) enum unit_type { @@ -222,7 +214,6 @@ static int get_pl_prim(struct rapl_domain *rd, int pl, enum pl_prims prim) container_of(_zone, struct rapl_domain, power_zone) static const struct rapl_defaults *defaults_msr; -static const struct rapl_defaults defaults_tpmi; static const struct rapl_defaults *get_defaults(struct rapl_package *rp) { @@ -779,7 +770,6 @@ static int rapl_config(struct rapl_package *rp) rp->priv->rpi = (void *)rpi_msr; break; case RAPL_IF_TPMI: - rp->priv->defaults = &defaults_tpmi; rp->priv->rpi = (void *)rpi_tpmi; break; default: @@ -1148,41 +1138,6 @@ static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, return (value) ? value * rd->time_unit : rd->time_unit; } -static int rapl_check_unit_tpmi(struct rapl_domain *rd) -{ - struct reg_action ra; - u32 value; - - ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; - ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { - pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", - ra.reg.val, rd->rp->name, rd->name); - return -ENODEV; - } - - value = (ra.value & TPMI_ENERGY_UNIT_MASK) >> TPMI_ENERGY_UNIT_OFFSET; - rd->energy_unit = (ENERGY_UNIT_SCALE * MICROJOULE_PER_JOULE) >> value; - - value = (ra.value & TPMI_POWER_UNIT_MASK) >> TPMI_POWER_UNIT_OFFSET; - rd->power_unit = MICROWATT_PER_WATT >> value; - - value = (ra.value & TPMI_TIME_UNIT_MASK) >> TPMI_TIME_UNIT_OFFSET; - rd->time_unit = USEC_PER_SEC >> value; - - pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", - rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); - - return 0; -} - -static const struct rapl_defaults defaults_tpmi = { - .check_unit = rapl_check_unit_tpmi, - /* Reuse existing logic, ignore the PL_CLAMP failures and enable all Power Limits */ - .set_floor_freq = rapl_default_set_floor_freq, - .compute_time_window = rapl_default_compute_time_window, -}; - static const struct rapl_defaults rapl_defaults_core = { .floor_freq_reg_addr = 0, .check_unit = rapl_default_check_unit, diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 1fa821376dde..c06d687366fc 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -9,12 +9,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include #include #include #include +#include #define TPMI_RAPL_MAJOR_VERSION 0 #define TPMI_RAPL_MINOR_VERSION 1 @@ -250,6 +252,50 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) return 0; } +/* TPMI Unit register has different layout */ +#define TPMI_ENERGY_UNIT_SCALE 1000 +#define TPMI_POWER_UNIT_OFFSET 0x00 +#define TPMI_POWER_UNIT_MASK GENMASK(3, 0) +#define TPMI_ENERGY_UNIT_OFFSET 0x06 +#define TPMI_ENERGY_UNIT_MASK GENMASK_ULL(10, 6) +#define TPMI_TIME_UNIT_OFFSET 0x0C +#define TPMI_TIME_UNIT_MASK GENMASK_ULL(15, 12) + +static int rapl_check_unit_tpmi(struct rapl_domain *rd) +{ + struct reg_action ra; + u32 value; + + ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; + ra.mask = ~0; + if (tpmi_rapl_read_raw(rd->rp->id, &ra, false)) { + pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", + ra.reg.val, rd->rp->name, rd->name); + return -ENODEV; + } + + value = (ra.value & TPMI_ENERGY_UNIT_MASK) >> TPMI_ENERGY_UNIT_OFFSET; + rd->energy_unit = (TPMI_ENERGY_UNIT_SCALE * MICROJOULE_PER_JOULE) >> value; + + value = (ra.value & TPMI_POWER_UNIT_MASK) >> TPMI_POWER_UNIT_OFFSET; + rd->power_unit = MICROWATT_PER_WATT >> value; + + value = (ra.value & TPMI_TIME_UNIT_MASK) >> TPMI_TIME_UNIT_OFFSET; + rd->time_unit = USEC_PER_SEC >> value; + + pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", + rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); + + return 0; +} + +static const struct rapl_defaults defaults_tpmi = { + .check_unit = rapl_check_unit_tpmi, + /* Reuse existing logic, ignore the PL_CLAMP failures and enable all Power Limits */ + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, +}; + static int intel_rapl_tpmi_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id) { @@ -297,6 +343,7 @@ static int intel_rapl_tpmi_probe(struct auxiliary_device *auxdev, trp->priv.read_raw = tpmi_rapl_read_raw; trp->priv.write_raw = tpmi_rapl_write_raw; trp->priv.control_type = tpmi_control_type; + trp->priv.defaults = &defaults_tpmi; /* RAPL TPMI I/F is per physical package */ trp->rp = rapl_find_package_domain(info->package_id, &trp->priv, false); -- cgit v1.2.3 From f82fabe18c75a7703e4bd81a9eabc9aa2303e1fb Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:42 -0800 Subject: thermal: intel: int340x: processor: Move RAPL defaults to MMIO driver Previously, the MMIO and MSR RAPL interfaces shared the same set of RAPL defaults provided by common code. However, unlike the MSR interface, the MMIO RAPL interface does not require CPU-specific variations in its default handling. Keeping the RAPL defaults in the RAPL common driver therefore provides no additional benefit. Move the MMIO defaults into the MMIO interface driver. This change includes the following updates: * Introduce a MMIO-local rapl_defaults instance with the appropriate default callbacks. * Assign the MMIO-specific rapl_defaults to priv->defaults during MMIO driver initialization. No functional changes are expected. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-11-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c index e56b18aeda71..5dbeb0a43c8c 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c @@ -19,6 +19,13 @@ static const struct rapl_mmio_regs rapl_mmio_default = { .limits[RAPL_DOMAIN_DRAM] = BIT(POWER_LIMIT2), }; +static const struct rapl_defaults rapl_defaults_mmio = { + .floor_freq_reg_addr = 0, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, +}; + static int rapl_mmio_read_raw(int cpu, struct reg_action *ra, bool atomic) { if (!ra->reg.mmio) @@ -67,6 +74,7 @@ int proc_thermal_rapl_add(struct pci_dev *pdev, struct proc_thermal_device *proc rapl_mmio_priv.read_raw = rapl_mmio_read_raw; rapl_mmio_priv.write_raw = rapl_mmio_write_raw; + rapl_mmio_priv.defaults = &rapl_defaults_mmio; rapl_mmio_priv.control_type = powercap_register_control_type(NULL, "intel-rapl-mmio", NULL); if (IS_ERR(rapl_mmio_priv.control_type)) { -- cgit v1.2.3 From 6acae3c8332f3125e331f2d896bef994d4460376 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Thu, 19 Feb 2026 19:15:55 +0100 Subject: cpufreq: intel_pstate: Allow repeated intel_pstate disable Repeated intel_pstate disables currently return an error, adding unnecessary complexity to userspace scripts which must first read the current state and conditionally write 'off'. Make repeated intel_pstate disables a no-op. Signed-off-by: Fabio M. De Francesco Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260219181600.16388-1-fabio.m.de.francesco@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 11c58af41900..51938c5a47ca 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3472,7 +3472,7 @@ static int intel_pstate_update_status(const char *buf, size_t size) { if (size == 3 && !strncmp(buf, "off", size)) { if (!intel_pstate_driver) - return -EINVAL; + return 0; if (hwp_active) return -EBUSY; -- cgit v1.2.3 From 022eec206a32c985f096ccc79894dad50dcf04f6 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 23 Feb 2026 22:03:51 +0100 Subject: cpufreq: governor: Use sysfs_emit() in sysfs show functions Replace sprintf() with sysfs_emit() in sysfs show functions. sysfs_emit() is preferred for formatting sysfs output because it provides safer bounds checking. No functional changes. Signed-off-by: Thorsten Blum Acked-by: Viresh Kumar [ rjw: Subject tweak ] Link: https://patch.msgid.link/20260223210351.344388-2-thorsten.blum@linux.dev Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 168c23fd7fca..732fd4d1a803 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -21,6 +21,7 @@ #include #include #include +#include /* Ondemand Sampling types */ enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE}; @@ -57,7 +58,7 @@ static ssize_t file_name##_show \ { \ struct dbs_data *dbs_data = to_dbs_data(attr_set); \ struct _gov##_dbs_tuners *tuners = dbs_data->tuners; \ - return sprintf(buf, "%u\n", tuners->file_name); \ + return sysfs_emit(buf, "%u\n", tuners->file_name); \ } #define gov_show_one_common(file_name) \ @@ -65,7 +66,7 @@ static ssize_t file_name##_show \ (struct gov_attr_set *attr_set, char *buf) \ { \ struct dbs_data *dbs_data = to_dbs_data(attr_set); \ - return sprintf(buf, "%u\n", dbs_data->file_name); \ + return sysfs_emit(buf, "%u\n", dbs_data->file_name); \ } #define gov_attr_ro(_name) \ -- cgit v1.2.3 From db5e64651a1b3ddf33a25ccfc1d28da1c427780b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Mar 2026 11:43:27 +0530 Subject: cpufreq: Add MAINTAINERS entry for CPPC driver Add a MAINTAINERS entry for the CPPC driver, with a list of reviewers from multiple organizations. Also sort the list alphabetically while doing so. Signed-off-by: Viresh Kumar --- MAINTAINERS | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 55af015174a5..7622601a36f8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6620,6 +6620,17 @@ M: Bence Csókás S: Maintained F: drivers/i2c/busses/i2c-cp2615.c +CPU FREQUENCY DRIVERS - CPPC CPUFREQ +M: "Rafael J. Wysocki" +M: Viresh Kumar +R: Jie Zhan +R: Lifeng Zheng +R: Pierre Gondois +R: Sumit Gupta +L: linux-pm@vger.kernel.org +S: Maintained +F: drivers/cpufreq/cppc_cpufreq.c + CPU FREQUENCY DRIVERS - VEXPRESS SPC ARM BIG LITTLE M: Viresh Kumar M: Sudeep Holla @@ -6628,6 +6639,12 @@ S: Maintained W: http://www.arm.com/products/processors/technologies/biglittleprocessing.php F: drivers/cpufreq/vexpress-spc-cpufreq.c +CPU FREQUENCY DRIVERS - VIRTUAL MACHINE CPUFREQ +M: Saravana Kannan +L: linux-pm@vger.kernel.org +S: Maintained +F: drivers/cpufreq/virtual-cpufreq.c + CPU FREQUENCY SCALING FRAMEWORK M: "Rafael J. Wysocki" M: Viresh Kumar @@ -6647,12 +6664,6 @@ F: kernel/sched/cpufreq*.c F: rust/kernel/cpufreq.rs F: tools/testing/selftests/cpufreq/ -CPU FREQUENCY DRIVERS - VIRTUAL MACHINE CPUFREQ -M: Saravana Kannan -L: linux-pm@vger.kernel.org -S: Maintained -F: drivers/cpufreq/virtual-cpufreq.c - CPU HOTPLUG M: Thomas Gleixner M: Peter Zijlstra -- cgit v1.2.3 From d51de21b4c3a34a2cc592319df63864e14b18b29 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 9 Mar 2026 10:38:18 +0200 Subject: intel_idle: Add Panther Lake C-states table Panther Lake supports the following requestable C-states: C1, C1E, C6S, C10. The parameters of these C-states should be consistent across all systems based on Panther Lake, so add a custom C-states table for it that will override C-state parameters supplied by platform firmware that may vary from one platform to another and may not represent the most optimum choice. Signed-off-by: Artem Bityutskiy [ rjw: Changelog expansion ] Link: https://patch.msgid.link/20260309083818.79588-1-dedekind1@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f49c939d636f..f49354e37777 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -983,6 +983,43 @@ static struct cpuidle_state mtl_l_cstates[] __initdata = { .enter = NULL } }; +static struct cpuidle_state ptl_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, + .target_residency = 10, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6S", + .desc = "MWAIT 0x21", + .flags = MWAIT2flg(0x21) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, + .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C10", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 370, + .target_residency = 2500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state gmt_cstates[] __initdata = { { .name = "C1", @@ -1561,6 +1598,10 @@ static const struct idle_cpu idle_cpu_mtl_l __initconst = { .state_table = mtl_l_cstates, }; +static const struct idle_cpu idle_cpu_ptl __initconst = { + .state_table = ptl_cstates, +}; + static const struct idle_cpu idle_cpu_gmt __initconst = { .state_table = gmt_cstates, }; @@ -1669,6 +1710,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_VFM(INTEL_ALDERLAKE, &idle_cpu_adl), X86_MATCH_VFM(INTEL_ALDERLAKE_L, &idle_cpu_adl_l), X86_MATCH_VFM(INTEL_METEORLAKE_L, &idle_cpu_mtl_l), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &idle_cpu_ptl), X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &idle_cpu_gmt), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &idle_cpu_spr), -- cgit v1.2.3 From d6f8e0e06dee066076a45eea4f9f85247f85477d Mon Sep 17 00:00:00 2001 From: Faruque Ansari Date: Mon, 16 Mar 2026 16:07:52 +0530 Subject: cpufreq: Add QCS8300 to cpufreq-dt-platdev blocklist The Qualcomm QCS8300 platform uses the qcom-cpufreq-hw driver, so add it to the cpufreq-dt-platdev driver's blocklist. Signed-off-by: Faruque Ansari Reviewed-by: Mukesh Ojha Reviewed-by: Dmitry Baryshkov Reviewed-by: Konrad Dybcio Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt-platdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 25fd3b191b7e..ff1204c666b1 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -159,6 +159,7 @@ static const struct of_device_id blocklist[] __initconst = { { .compatible = "qcom,qcm2290", }, { .compatible = "qcom,qcm6490", }, { .compatible = "qcom,qcs404", }, + { .compatible = "qcom,qcs8300", }, { .compatible = "qcom,qdu1000", }, { .compatible = "qcom,sa8155p" }, { .compatible = "qcom,sa8540p" }, -- cgit v1.2.3 From 16c1e8385b3bb65d412d7a60107f8894587c63fa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Mar 2026 15:25:44 -0400 Subject: cpufreq: optimize policy_is_shared() The switch to cpumask_nth() over cpumask_weight(), as it may return earlier - as soon as the function counts the required number of CPUs. Signed-off-by: Yury Norov Acked-by: Viresh Kumar Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/20260314192544.605914-1-ynorov@nvidia.com Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..8ca2bcb3d7ae 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -232,7 +232,7 @@ static inline bool policy_is_inactive(struct cpufreq_policy *policy) static inline bool policy_is_shared(struct cpufreq_policy *policy) { - return cpumask_weight(policy->cpus) > 1; + return cpumask_nth(1, policy->cpus) < nr_cpumask_bits; } #ifdef CONFIG_CPU_FREQ -- cgit v1.2.3 From 9b0f1cd58fe9d2c95eae97f089040f1a5b02c097 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 Mar 2026 13:12:25 -0700 Subject: PM: hibernate: x86: Remove inclusion of crypto/hash.h hibernate_64.c does not do any cryptographic hashing, so the header crypto/hash.h is not needed at all. Signed-off-by: Eric Biggers [ rjw: Subject tweak ] Link: https://patch.msgid.link/20260314201225.38822-1-ebiggers@kernel.org Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate_64.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a595953f1d6d..e72d26acae79 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -14,8 +14,6 @@ #include #include -#include - #include #include #include -- cgit v1.2.3 From 8655a4e35cda5534f93303af393eac4e71704701 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Wed, 18 Mar 2026 12:12:33 -0700 Subject: cpufreq: tegra194: remove COMPILE_TEST The driver needs architecture specific headers to build. Problem gets exposed when TEGRA_BPMP gets COMPILE_TEST added to it. Signed-off-by: Rosen Penev Signed-off-by: Viresh Kumar --- drivers/cpufreq/Kconfig.arm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 4014bc9dd73a..47c9b031f1b3 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -248,7 +248,7 @@ config ARM_TEGRA186_CPUFREQ config ARM_TEGRA194_CPUFREQ tristate "Tegra194 CPUFreq support" - depends on ARCH_TEGRA_194_SOC || ARCH_TEGRA_234_SOC || (64BIT && COMPILE_TEST) + depends on ARCH_TEGRA_194_SOC || ARCH_TEGRA_234_SOC depends on TEGRA_BPMP default ARCH_TEGRA_194_SOC || ARCH_TEGRA_234_SOC help -- cgit v1.2.3 From 2b27ea5b644d7da9bc84f4539e53d1b31c601566 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 9 Mar 2026 18:39:42 +0100 Subject: PM: hibernate: return -ENODATA if the snapshot image is not loaded snapshot_image_loaded() is used in both the in-kernel and the userspace restore path to ensure that the snapshot image has been completely loaded. However the latter path returns -EPERM in such situations, which is meant for cases where the operation is neither write-only nor ready. This patch updates the check so the returned error code is -ENODATA in both cases. Suggested-by: Brian Geffon Signed-off-by: Alberto Garcia Acked-by: Brian Geffon Link: https://patch.msgid.link/8cfda38659c623f5392f3458cb32504ffd556a74.1773075892.git.berto@igalia.com Signed-off-by: Rafael J. Wysocki --- kernel/power/user.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/power/user.c b/kernel/power/user.c index 4401cfe26e5c..be77f3556bd7 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -322,11 +322,14 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = snapshot_write_finalize(&data->handle); if (error) break; - if (data->mode != O_WRONLY || !data->frozen || - !snapshot_image_loaded(&data->handle)) { + if (data->mode != O_WRONLY || !data->frozen) { error = -EPERM; break; } + if (!snapshot_image_loaded(&data->handle)) { + error = -ENODATA; + break; + } error = hibernation_restore(data->platform_support); break; -- cgit v1.2.3 From 16fb8d8a0e050e8f151da7dd2e03ccc500dfd8da Mon Sep 17 00:00:00 2001 From: Henry Tseng Date: Tue, 24 Mar 2026 17:09:48 +0800 Subject: cpufreq: acpi-cpufreq: use DMI max speed when CPPC is unavailable On AMD Ryzen Embedded V1780B (Family 17h, Zen 1), the BIOS does not provide ACPI _CPC objects and the CPU does not support MSR-based CPPC (X86_FEATURE_CPPC). The _PSS table only lists nominal P-states (P0 = 3350 MHz), so when get_max_boost_ratio() fails at cppc_get_perf_caps(), cpuinfo_max_freq reports only the base frequency instead of the rated boost frequency (3600 MHz). dmesg: ACPI CPPC: No CPC descriptor for CPU:0 acpi_cpufreq: CPU0: Unable to get performance capabilities (-19) cppc-cpufreq already has a DMI fallback (cppc_get_dmi_max_khz()) that reads the processor max speed from SMBIOS Type 4. Export it and reuse it in acpi-cpufreq as a last-resort source for the boost frequency. A sanity check ensures the DMI value is above the _PSS P0 frequency and within 2x of it; values outside that range are ignored and the existing arch_set_max_freq_ratio() path is taken instead. The 2x upper bound is based on a survey of the AMD Ryzen Embedded V1000 series, where the highest boost-to-base ratio is 1.8x (V1404I: 2.0 GHz base / 3.6 GHz boost). The DMI lookup and sanity check are wrapped in a helper, acpi_cpufreq_resolve_max_freq(), which falls through to arch_set_max_freq_ratio() if the DMI value is absent or out of range. Tested on AMD Ryzen Embedded V1780B with v7.0-rc4: Before: cpuinfo_max_freq = 3350000 (base only) After: cpuinfo_max_freq = 3600000 (includes boost) Link: https://www.amd.com/en/products/embedded/ryzen/ryzen-v1000-series.html#specifications Signed-off-by: Henry Tseng Link: https://patch.msgid.link/20260324090948.1667340-1-henrytseng@qnap.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 3 ++- drivers/cpufreq/acpi-cpufreq.c | 31 ++++++++++++++++++++++++------- include/acpi/cppc_acpi.h | 1 + 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index f0e513e9ed5d..f53de414acf2 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1944,7 +1944,7 @@ static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) } /* Look up the max frequency in DMI */ -static u64 cppc_get_dmi_max_khz(void) +u64 cppc_get_dmi_max_khz(void) { u16 mhz = 0; @@ -1958,6 +1958,7 @@ static u64 cppc_get_dmi_max_khz(void) return KHZ_PER_MHZ * mhz; } +EXPORT_SYMBOL_GPL(cppc_get_dmi_max_khz); /* * If CPPC lowest_freq and nominal_freq registers are exposed then we can diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index e7eff6c2f092..21639d9ac753 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -675,6 +675,29 @@ static inline u64 get_max_boost_ratio(unsigned int cpu, u64 *nominal_freq) } #endif +static void acpi_cpufreq_resolve_max_freq(struct cpufreq_policy *policy, + unsigned int pss_max_freq) +{ +#ifdef CONFIG_ACPI_CPPC_LIB + u64 max_speed = cppc_get_dmi_max_khz(); + /* + * Use DMI "Max Speed" if it looks plausible: must be + * above _PSS P0 frequency and within 2x of it. + */ + if (max_speed > pss_max_freq && max_speed < pss_max_freq * 2) { + policy->cpuinfo.max_freq = max_speed; + return; + } +#endif + /* + * If the maximum "boost" frequency is unknown, ask the arch + * scale-invariance code to use the "nominal" performance for + * CPU utilization scaling so as to prevent the schedutil + * governor from selecting inadequate CPU frequencies. + */ + arch_set_max_freq_ratio(true); +} + static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) { struct cpufreq_frequency_table *freq_table; @@ -849,13 +872,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.max_freq = freq * max_boost_ratio >> SCHED_CAPACITY_SHIFT; } else { - /* - * If the maximum "boost" frequency is unknown, ask the arch - * scale-invariance code to use the "nominal" performance for - * CPU utilization scaling so as to prevent the schedutil - * governor from selecting inadequate CPU frequencies. - */ - arch_set_max_freq_ratio(true); + acpi_cpufreq_resolve_max_freq(policy, freq_table[0].frequency); } policy->freq_table = freq_table; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 4d644f03098e..e6c5ef3173c5 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -156,6 +156,7 @@ extern int cppc_set_enable(int cpu, bool enable); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); extern bool cppc_perf_ctrs_in_pcc_cpu(unsigned int cpu); extern bool cppc_perf_ctrs_in_pcc(void); +extern u64 cppc_get_dmi_max_khz(void); extern unsigned int cppc_perf_to_khz(struct cppc_perf_caps *caps, unsigned int perf); extern unsigned int cppc_khz_to_perf(struct cppc_perf_caps *caps, unsigned int freq); extern bool acpi_cpc_valid(void); -- cgit v1.2.3 From 193f41dad3d4197b35be8b4cff13c606eb0e1efe Mon Sep 17 00:00:00 2001 From: Roberto Ricci Date: Tue, 24 Mar 2026 23:39:01 +0100 Subject: cpupower-idle-info.1: fix short option names The cpupower-idle-info(1) man page describes '-f' as the short form of the '--silent' option and '-e' as the short form of the '--proc' option. But they are not correct: $ cpupower idle-info -f idle-info: invalid option -- 'f' invalid or unknown argument $ cpupower idle-info -e idle-info: invalid option -- 'e' invalid or unknown argument The short form of '--silent' is actually '-s' and the short form of '--proc' is actually '-o': cpuidle-info.c: {"silent", no_argument, NULL, 's'}, {"proc", no_argument, NULL, 'o'}, Signed-off-by: Roberto Ricci Link: https://lore.kernel.org/r/20260324223921.14317-2-io@r-ricci.it Signed-off-by: Shuah Khan --- tools/power/cpupower/man/cpupower-idle-info.1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/cpupower/man/cpupower-idle-info.1 b/tools/power/cpupower/man/cpupower-idle-info.1 index 20b6345c53ad..b2f92aba5f5b 100644 --- a/tools/power/cpupower/man/cpupower-idle-info.1 +++ b/tools/power/cpupower/man/cpupower-idle-info.1 @@ -11,10 +11,10 @@ A tool which prints out per cpu idle information helpful to developers and inter .SH "OPTIONS" .LP .TP -\fB\-f\fR \fB\-\-silent\fR +\fB\-s\fR \fB\-\-silent\fR Only print a summary of all available C-states in the system. .TP -\fB\-e\fR \fB\-\-proc\fR +\fB\-o\fR \fB\-\-proc\fR deprecated. Prints out idle information in old /proc/acpi/processor/*/power format. This interface has been removed from the kernel for quite some time, do not let -- cgit v1.2.3 From 10a54e715985baac21bb506f86c5ad1b83a12784 Mon Sep 17 00:00:00 2001 From: Roberto Ricci Date: Tue, 24 Mar 2026 23:39:02 +0100 Subject: cpupower-frequency-info.1: use the proper name of the --perf option The cpupower-frequency-info(1) man page describes a '--perf' option. Even though this form is accepted by the program, its proper name is '--performance'. cpufreq-info.c: {"performance", no_argument, NULL, 'c'}, Signed-off-by: Roberto Ricci Link: https://lore.kernel.org/r/20260324223921.14317-3-io@r-ricci.it Signed-off-by: Shuah Khan --- tools/power/cpupower/man/cpupower-frequency-info.1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/man/cpupower-frequency-info.1 b/tools/power/cpupower/man/cpupower-frequency-info.1 index 47fdd7218748..1173d4f31e69 100644 --- a/tools/power/cpupower/man/cpupower-frequency-info.1 +++ b/tools/power/cpupower/man/cpupower-frequency-info.1 @@ -53,7 +53,7 @@ human\-readable output for the \-f, \-w, \-s and \-y parameters. \fB\-n\fR \fB\-\-no-rounding\fR Output frequencies and latencies without rounding off values. .TP -\fB\-c\fR \fB\-\-perf\fR +\fB\-c\fR \fB\-\-performance\fR Get performances and frequencies capabilities of CPPC, by reading it from hardware (only available on the hardware with CPPC). .TP .SH "REMARKS" -- cgit v1.2.3 From ed2946802bad2ec997383d22edfc14ae44245183 Mon Sep 17 00:00:00 2001 From: Roberto Ricci Date: Tue, 24 Mar 2026 23:39:03 +0100 Subject: cpupower-frequency-info.1: document --boost and --epp options `cpupower frequency-info` supports the '--boost' option since the program was first added with commit 7fe2f6399a84 ("cpupowerutils - cpufrequtils extended with quite some features"), but the man page lacks it. '--epp' has been added with commit 5f567afc283f ("cpupower: Add support for showing energy performance preference") but it has never been added to the man page. cpufreq-info.c: {"boost", no_argument, NULL, 'b'}, ... {"epp", no_argument, NULL, 'z'}, Signed-off-by: Roberto Ricci Link: https://lore.kernel.org/r/20260324223921.14317-4-io@r-ricci.it Signed-off-by: Shuah Khan --- tools/power/cpupower/man/cpupower-frequency-info.1 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/power/cpupower/man/cpupower-frequency-info.1 b/tools/power/cpupower/man/cpupower-frequency-info.1 index 1173d4f31e69..b0d69c9adcbd 100644 --- a/tools/power/cpupower/man/cpupower-frequency-info.1 +++ b/tools/power/cpupower/man/cpupower-frequency-info.1 @@ -32,6 +32,12 @@ Gets the currently used cpufreq policy. \fB\-g\fR \fB\-\-governors\fR Determines available cpufreq governors. .TP +\fB\-b\fR \fB\-\-boost\fR +Gets the current boost state support. +.TP +\fB\-z\fR \fB\-\-epp\fR +Gets the current EPP (energy performance preference). +.TP \fB\-r\fR \fB\-\-related\-cpus\fR Determines which CPUs run at the same hardware frequency. .TP -- cgit v1.2.3 From 8bbd81ddbe174aa8488db7971fe2717cb636a46c Mon Sep 17 00:00:00 2001 From: Roberto Ricci Date: Tue, 24 Mar 2026 23:39:04 +0100 Subject: cpupower-info.1: describe the --perf-bias option The cpupower-info(1) man page only mentions the short form of the '--perf-bias' option in the synopsys, but the long form is not documented and its effect is not explained. cpupower-info.c: {"perf-bias", optional_argument, NULL, 'b'}, Signed-off-by: Roberto Ricci Link: https://lore.kernel.org/r/20260324223921.14317-5-io@r-ricci.it Signed-off-by: Shuah Khan --- tools/power/cpupower/man/cpupower-info.1 | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/power/cpupower/man/cpupower-info.1 b/tools/power/cpupower/man/cpupower-info.1 index 340bcd0be7de..1f42d8c388a0 100644 --- a/tools/power/cpupower/man/cpupower-info.1 +++ b/tools/power/cpupower/man/cpupower-info.1 @@ -3,7 +3,7 @@ cpupower\-info \- Shows processor power related kernel or hardware configurations .SH SYNOPSIS .ft B -.B cpupower info [ \-b ] +.B cpupower info [\fIoptions\fP] .SH DESCRIPTION \fBcpupower info \fP shows kernel configurations or processor hardware @@ -13,6 +13,13 @@ Some options are platform wide, some affect single cores. By default values of core zero are displayed only. cpupower --cpu all cpuinfo will show the settings of all cores, see cpupower(1) how to choose specific cores. +.SH "OPTIONS" +.LP +.TP +\fB\-b\fR \fB\-\-perf-bias\fR +Gets the current performance bias value. +.TP + .SH "SEE ALSO" Options are described in detail in: -- cgit v1.2.3 From f3b536878a3cf47e5193a96176a3ca2aaf0d848f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Mar 2026 22:14:44 -0700 Subject: powercap: correct kernel-doc function parameter names Use the correct function parameter names in kernel-doc comments to avoid these warnings: Warning: include/linux/powercap.h:254 function parameter 'name' not described in 'powercap_register_control_type' Warning: include/linux/powercap.h:298 function parameter 'nr_constraints' not described in 'powercap_register_zone' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260312051444.685136-1-rdunlap@infradead.org Signed-off-by: Rafael J. Wysocki --- include/linux/powercap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/powercap.h b/include/linux/powercap.h index 3d557bbcd2c7..603419db924c 100644 --- a/include/linux/powercap.h +++ b/include/linux/powercap.h @@ -238,7 +238,7 @@ static inline void *powercap_get_zone_data(struct powercap_zone *power_zone) * Advantage of this parameter is that client can embed * this data in its data structures and allocate in a * single call, preventing multiple allocations. -* @control_type_name: The Name of this control_type, which will be shown +* @name: The Name of this control_type, which will be shown * in the sysfs Interface. * @ops: Callbacks for control type. This parameter is optional. * @@ -277,7 +277,7 @@ int powercap_unregister_control_type(struct powercap_control_type *instance); * @name: A name for this zone. * @parent: A pointer to the parent power zone instance if any or NULL * @ops: Pointer to zone operation callback structure. -* @no_constraints: Number of constraints for this zone +* @nr_constraints: Number of constraints for this zone * @const_ops: Pointer to constraint callback structure * * Register a power zone under a given control type. A power zone must register -- cgit v1.2.3 From 8765715b4e8a1dd24ab5d507c42fc0bcd3d83f5c Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Fri, 13 Mar 2026 11:53:27 -0700 Subject: powercap: intel_rapl: Remove unused AVERAGE_POWER primitive The AVERAGE_POWER primitive and RAPL_PRIMITIVE_DERIVED flag are not used anywhere in the code. Remove them to simplify the primitive handling logic. No functional changes. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260313185333.2370733-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 13 ------------- include/linux/intel_rapl.h | 1 - 2 files changed, 14 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 6a2153039f73..e099514e6c56 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -90,7 +90,6 @@ #define TPMI_INFO_MAX_TIME_WIN_MASK GENMASK_ULL(60, 54) /* Non HW constants */ -#define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ #define RAPL_PRIMITIVE_DUMMY BIT(2) #define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ @@ -703,9 +702,6 @@ static struct rapl_primitive_info rpi_msr[NR_RAPL_PRIMITIVES] = { 19, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), [PSYS_TIME_WINDOW2] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - /* non-hardware */ - [AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, - RAPL_PRIMITIVE_DERIVED), }; /* RAPL primitives for TPMI I/F */ @@ -745,9 +741,6 @@ static struct rapl_primitive_info rpi_tpmi[NR_RAPL_PRIMITIVES] = { 54, RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0, RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), - /* non-hardware */ - [AVERAGE_POWER] = PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, - RAPL_PRIMITIVE_DERIVED), }; static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim) @@ -841,12 +834,6 @@ static int rapl_read_data_raw(struct rapl_domain *rd, if (!ra.reg.val) return -EINVAL; - /* non-hardware data are collected by the polling thread */ - if (rpi->flag & RAPL_PRIMITIVE_DERIVED) { - *data = rd->rdd.primitives[prim]; - return 0; - } - ra.mask = rpi->mask; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, pmu_ctx)) { diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 6d694099a3ad..9e6bd654be1f 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -77,7 +77,6 @@ enum rapl_primitives { PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW2, /* below are not raw primitive data */ - AVERAGE_POWER, NR_RAPL_PRIMITIVES, }; -- cgit v1.2.3 From 04aa9d0726cc6a23b348498815a9722b42d27c91 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 26 Mar 2026 21:44:00 +0100 Subject: cpufreq: Remove max_freq_req update for pre-existing policy policy->max_freq_req QoS constraint represents the maximal allowed frequency than can be requested. It is set by: - writing to policyX/scaling_max sysfs file - toggling the cpufreq/boost sysfs file Upon calling freq_qos_update_request(), a successful update of the max_freq_req value triggers cpufreq_notifier_max(), followed by cpufreq_set_policy() which update the requested frequency for the policy. If the new max_freq_req value is not different from the original value, no frequency update is triggered. In a specific sequence of toggling: - cpufreq/boost sysfs file - CPU hot-plugging a CPU could end up with boost enabled but running at the maximal non-boost frequency, cpufreq_notifier_max() not being triggered. The following fixed that: commit 1608f0230510 ("cpufreq: Fix re-boost issue after hotplugging a CPU") The following: commit dd016f379ebc ("cpufreq: Introduce a more generic way to set default per-policy boost flag") also fixed the issue by correctly setting the max_freq_req constraint of a policy that is re-activated. This makes the first fix unnecessary. As the original issue is fixed by another method, this patch reverts: commit 1608f0230510 ("cpufreq: Fix re-boost issue after hotplugging a CPU") Reviewed-by: Lifeng Zheng Signed-off-by: Pierre Gondois Acked-by: Viresh Kumar Link: https://patch.msgid.link/20260326204404.1401849-2-pierre.gondois@arm.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0f926ee23cdb..b127f5cb682c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1484,10 +1484,6 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy, blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); - } else { - ret = freq_qos_update_request(policy->max_freq_req, policy->max); - if (ret < 0) - goto out_destroy_policy; } if (cpufreq_driver->get && has_target()) { -- cgit v1.2.3 From 6e39ba4e5a82aa5469b2ac517b74a71accb0540f Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 26 Mar 2026 21:44:01 +0100 Subject: cpufreq: Add boost_freq_req QoS request The Power Management Quality of Service (PM QoS) allows to aggregate constraints from multiple entities. It is currently used to manage the min/max frequency of a given policy. Frequency constraints can come for instance from: - Thermal framework: acpi_thermal_cpufreq_init() - Firmware: _PPC objects: acpi_processor_ppc_init() - User: by setting policyX/scaling_[min|max]_freq The minimum of the max frequency constraints is used to compute the resulting maximum allowed frequency. When enabling boost frequencies, the same frequency request object (policy->max_freq_req) as to handle requests from users is used. As a result, when setting: - scaling_max_freq - boost The last sysfs file used overwrites the request from the other sysfs file. To avoid this, create a per-policy boost_freq_req to save the boost constraints instead of overwriting the last scaling_max_freq constraint. policy_set_boost() calls the cpufreq set_boost callback. Update the newly added boost_freq_req request from there: - whenever boost is toggled - to cover all possible paths In the existing .set_boost() callbacks: - Don't update policy->max as this is done through the qos notifier cpufreq_notifier_max() which calls cpufreq_set_policy(). - Remove freq_qos_update_request() calls as the qos request is now done in policy_set_boost() and updates the new boost_freq_req $ ## Init state scaling_max_freq:1000000 cpuinfo_max_freq:1000000 $ echo 700000 > scaling_max_freq scaling_max_freq:700000 cpuinfo_max_freq:1000000 $ echo 1 > ../boost scaling_max_freq:1200000 cpuinfo_max_freq:1200000 $ echo 800000 > scaling_max_freq scaling_max_freq:800000 cpuinfo_max_freq:1200000 $ ## Final step: $ ## Without the patches: $ echo 0 > ../boost scaling_max_freq:1000000 cpuinfo_max_freq:1000000 $ ## With the patches: $ echo 0 > ../boost scaling_max_freq:800000 cpuinfo_max_freq:1000000 Note: cpufreq_frequency_table_cpuinfo() updates policy->min and max from: A. cpufreq_boost_set_sw() \-cpufreq_frequency_table_cpuinfo() B. cpufreq_policy_online() \-cpufreq_table_validate_and_sort() \-cpufreq_frequency_table_cpuinfo() Keep these updates as some drivers expect policy->min and max to be set through B. Reviewed-by: Lifeng Zheng Signed-off-by: Pierre Gondois Acked-by: Viresh Kumar Link: https://patch.msgid.link/20260326204404.1401849-3-pierre.gondois@arm.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 2 -- drivers/cpufreq/cppc_cpufreq.c | 10 ++------- drivers/cpufreq/cpufreq.c | 46 ++++++++++++++++++++++++++++-------------- include/linux/cpufreq.h | 1 + 4 files changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5aa9fcd80cf5..d0675d6a19fe 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -769,8 +769,6 @@ static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) else if (policy->cpuinfo.max_freq > nominal_freq) policy->cpuinfo.max_freq = nominal_freq; - policy->max = policy->cpuinfo.max_freq; - if (cppc_state == AMD_PSTATE_PASSIVE) { ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq); if (ret < 0) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 011f35cb47b9..f4f574fbe547 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -807,17 +807,11 @@ static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) { struct cppc_cpudata *cpu_data = policy->driver_data; struct cppc_perf_caps *caps = &cpu_data->perf_caps; - int ret; if (state) - policy->max = cppc_perf_to_khz(caps, caps->highest_perf); + policy->cpuinfo.max_freq = cppc_perf_to_khz(caps, caps->highest_perf); else - policy->max = cppc_perf_to_khz(caps, caps->nominal_perf); - policy->cpuinfo.max_freq = policy->max; - - ret = freq_qos_update_request(policy->max_freq_req, policy->max); - if (ret < 0) - return ret; + policy->cpuinfo.max_freq = cppc_perf_to_khz(caps, caps->nominal_perf); return 0; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b127f5cb682c..c0aa970c7a67 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -609,10 +609,19 @@ static int policy_set_boost(struct cpufreq_policy *policy, bool enable) policy->boost_enabled = enable; ret = cpufreq_driver->set_boost(policy, enable); - if (ret) + if (ret) { policy->boost_enabled = !policy->boost_enabled; + return ret; + } - return ret; + ret = freq_qos_update_request(policy->boost_freq_req, policy->cpuinfo.max_freq); + if (ret < 0) { + policy->boost_enabled = !policy->boost_enabled; + cpufreq_driver->set_boost(policy, policy->boost_enabled); + return ret; + } + + return 0; } static ssize_t store_local_boost(struct cpufreq_policy *policy, @@ -1377,6 +1386,7 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy) } freq_qos_remove_request(policy->min_freq_req); + freq_qos_remove_request(policy->boost_freq_req); kfree(policy->min_freq_req); cpufreq_policy_put_kobj(policy); @@ -1442,26 +1452,38 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy, cpumask_and(policy->cpus, policy->cpus, cpu_online_mask); if (new_policy) { + unsigned int count; + for_each_cpu(j, policy->related_cpus) { per_cpu(cpufreq_cpu_data, j) = policy; add_cpu_dev_symlink(policy, j, get_cpu_device(j)); } - policy->min_freq_req = kzalloc(2 * sizeof(*policy->min_freq_req), + count = policy->boost_supported ? 3 : 2; + policy->min_freq_req = kzalloc(count * sizeof(*policy->min_freq_req), GFP_KERNEL); if (!policy->min_freq_req) { ret = -ENOMEM; goto out_destroy_policy; } + if (policy->boost_supported) { + policy->boost_freq_req = policy->min_freq_req + 2; + + ret = freq_qos_add_request(&policy->constraints, + policy->boost_freq_req, + FREQ_QOS_MAX, + policy->cpuinfo.max_freq); + if (ret < 0) { + policy->boost_freq_req = NULL; + goto out_destroy_policy; + } + } + ret = freq_qos_add_request(&policy->constraints, policy->min_freq_req, FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); if (ret < 0) { - /* - * So we don't call freq_qos_remove_request() for an - * uninitialized request. - */ kfree(policy->min_freq_req); policy->min_freq_req = NULL; goto out_destroy_policy; @@ -2785,16 +2807,10 @@ int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) return -ENXIO; ret = cpufreq_frequency_table_cpuinfo(policy); - if (ret) { + if (ret) pr_err("%s: Policy frequency update failed\n", __func__); - return ret; - } - - ret = freq_qos_update_request(policy->max_freq_req, policy->max); - if (ret < 0) - return ret; - return 0; + return ret; } EXPORT_SYMBOL_GPL(cpufreq_boost_set_sw); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 8ca2bcb3d7ae..b6f6c7d06912 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -81,6 +81,7 @@ struct cpufreq_policy { struct freq_constraints constraints; struct freq_qos_request *min_freq_req; struct freq_qos_request *max_freq_req; + struct freq_qos_request *boost_freq_req; struct cpufreq_frequency_table *freq_table; enum cpufreq_table_sorting freq_table_sorted; -- cgit v1.2.3 From 9266b4da051a410d9e6c5c0b0ef0c877855aa1b8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 31 Mar 2026 10:33:46 +0530 Subject: cpufreq: Allocate QoS freq_req objects with policy A recent change exposed a bug in the error path: if freq_qos_add_request(boost_freq_req) fails, min_freq_req may remain a valid pointer even though it was never successfully added. During policy teardown, this leads to an unconditional call to freq_qos_remove_request(), triggering a WARN. The current design allocates all three freq_req objects together, making the lifetime rules unclear and error handling fragile. Simplify this by allocating the QoS freq_req objects at policy allocation time. The policy itself is dynamically allocated, and two of the three requests are always needed anyway. This ensures consistent lifetime management and eliminates the inconsistent state in failure paths. Reported-by: Zhongqiu Han Fixes: 6e39ba4e5a82 ("cpufreq: Add boost_freq_req QoS request") Signed-off-by: Viresh Kumar Reviewed-by: Lifeng Zheng Tested-by: Pierre Gondois Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/a293f29d841b86c51f34699c6e717e01858d8ada.1774933424.git.viresh.kumar@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 53 +++++++++++++---------------------------------- include/linux/cpufreq.h | 6 +++--- 2 files changed, 17 insertions(+), 42 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c0aa970c7a67..f4a949f1e48f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -614,7 +614,7 @@ static int policy_set_boost(struct cpufreq_policy *policy, bool enable) return ret; } - ret = freq_qos_update_request(policy->boost_freq_req, policy->cpuinfo.max_freq); + ret = freq_qos_update_request(&policy->boost_freq_req, policy->cpuinfo.max_freq); if (ret < 0) { policy->boost_enabled = !policy->boost_enabled; cpufreq_driver->set_boost(policy, policy->boost_enabled); @@ -769,7 +769,7 @@ static ssize_t store_##file_name \ if (ret) \ return ret; \ \ - ret = freq_qos_update_request(policy->object##_freq_req, val);\ + ret = freq_qos_update_request(&policy->object##_freq_req, val); \ return ret >= 0 ? count : ret; \ } @@ -1374,7 +1374,7 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy) /* Cancel any pending policy->update work before freeing the policy. */ cancel_work_sync(&policy->update); - if (policy->max_freq_req) { + if (freq_qos_request_active(&policy->max_freq_req)) { /* * Remove max_freq_req after sending CPUFREQ_REMOVE_POLICY * notification, since CPUFREQ_CREATE_POLICY notification was @@ -1382,12 +1382,13 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy) */ blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_REMOVE_POLICY, policy); - freq_qos_remove_request(policy->max_freq_req); + freq_qos_remove_request(&policy->max_freq_req); } - freq_qos_remove_request(policy->min_freq_req); - freq_qos_remove_request(policy->boost_freq_req); - kfree(policy->min_freq_req); + if (freq_qos_request_active(&policy->min_freq_req)) + freq_qos_remove_request(&policy->min_freq_req); + if (freq_qos_request_active(&policy->boost_freq_req)) + freq_qos_remove_request(&policy->boost_freq_req); cpufreq_policy_put_kobj(policy); free_cpumask_var(policy->real_cpus); @@ -1452,57 +1453,31 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy, cpumask_and(policy->cpus, policy->cpus, cpu_online_mask); if (new_policy) { - unsigned int count; - for_each_cpu(j, policy->related_cpus) { per_cpu(cpufreq_cpu_data, j) = policy; add_cpu_dev_symlink(policy, j, get_cpu_device(j)); } - count = policy->boost_supported ? 3 : 2; - policy->min_freq_req = kzalloc(count * sizeof(*policy->min_freq_req), - GFP_KERNEL); - if (!policy->min_freq_req) { - ret = -ENOMEM; - goto out_destroy_policy; - } - if (policy->boost_supported) { - policy->boost_freq_req = policy->min_freq_req + 2; - ret = freq_qos_add_request(&policy->constraints, - policy->boost_freq_req, + &policy->boost_freq_req, FREQ_QOS_MAX, policy->cpuinfo.max_freq); - if (ret < 0) { - policy->boost_freq_req = NULL; + if (ret < 0) goto out_destroy_policy; - } } ret = freq_qos_add_request(&policy->constraints, - policy->min_freq_req, FREQ_QOS_MIN, + &policy->min_freq_req, FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); - if (ret < 0) { - kfree(policy->min_freq_req); - policy->min_freq_req = NULL; + if (ret < 0) goto out_destroy_policy; - } - - /* - * This must be initialized right here to avoid calling - * freq_qos_remove_request() on uninitialized request in case - * of errors. - */ - policy->max_freq_req = policy->min_freq_req + 1; ret = freq_qos_add_request(&policy->constraints, - policy->max_freq_req, FREQ_QOS_MAX, + &policy->max_freq_req, FREQ_QOS_MAX, FREQ_QOS_MAX_DEFAULT_VALUE); - if (ret < 0) { - policy->max_freq_req = NULL; + if (ret < 0) goto out_destroy_policy; - } blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b6f6c7d06912..9b10eb486ece 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -79,9 +79,9 @@ struct cpufreq_policy { * called, but you're in IRQ context */ struct freq_constraints constraints; - struct freq_qos_request *min_freq_req; - struct freq_qos_request *max_freq_req; - struct freq_qos_request *boost_freq_req; + struct freq_qos_request min_freq_req; + struct freq_qos_request max_freq_req; + struct freq_qos_request boost_freq_req; struct cpufreq_frequency_table *freq_table; enum cpufreq_table_sorting freq_table_sorted; -- cgit v1.2.3 From 2e00c2dcc5325af04e2dfbb29281ced1c724ab81 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Tue, 31 Mar 2026 08:42:42 +0100 Subject: cpufreq: clean up dead code in Kconfig There is already an 'if CPU_FREQ' condition wrapping these config options, making the 'depends on' statement for each a duplicate dependency (dead code). Leave the outer 'if CPU_FREQ...endif' and remove the individual 'depends on' statement from each option. This dead code was found by kconfirm, a static analysis tool for Kconfig. Signed-off-by: Julian Braha Acked-by: Viresh Kumar [ rjw: Subject and changelog edits ] Link: https://patch.msgid.link/20260331074242.39986-1-julianbraha@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Kconfig | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 78702a08364f..db83f3365698 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -163,7 +163,6 @@ config CPU_FREQ_GOV_ONDEMAND config CPU_FREQ_GOV_CONSERVATIVE tristate "'conservative' cpufreq governor" - depends on CPU_FREQ select CPU_FREQ_GOV_COMMON help 'conservative' - this driver is rather similar to the 'ondemand' @@ -188,7 +187,7 @@ config CPU_FREQ_GOV_CONSERVATIVE config CPU_FREQ_GOV_SCHEDUTIL bool "'schedutil' cpufreq policy governor" - depends on CPU_FREQ && SMP + depends on SMP select CPU_FREQ_GOV_ATTR_SET select IRQ_WORK help @@ -365,6 +364,6 @@ config ACPI_CPPC_CPUFREQ_FIE If in doubt, say N. -endif +endif # CPU_FREQ endmenu -- cgit v1.2.3 From 9d3a068cc80c9ed374cb78ab8c861668c835a6bc Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Tue, 31 Mar 2026 08:49:20 +0100 Subject: cpuidle: clean up dead dependencies on CPU_IDLE in Kconfig The Kconfig in the parent directory already has the first 'if CPU_IDLE' gating the inclusion of this Kconfig, meaning that the 'depends on CPUIDLE' statements in these config options are effectively dead code. Leave the 'if CPU_IDLE...endif' condition, and remove the individual 'depends on' statements in Kconfig.mips and Kconfig.powerpc This dead code was found by kconfirm, a static analysis tool for Kconfig. Signed-off-by: Julian Braha [ rjw: Subject and changelog edits ] Link: https://patch.msgid.link/20260331074920.41269-1-julianbraha@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/Kconfig | 2 +- drivers/cpuidle/Kconfig.mips | 2 +- drivers/cpuidle/Kconfig.powerpc | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index cac5997dca50..d6d8386d3f02 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -81,7 +81,7 @@ config HALTPOLL_CPUIDLE before halting in the guest (more efficient than polling in the host via halt_poll_ns for some scenarios). -endif +endif # CPU_IDLE config ARCH_NEEDS_CPU_IDLE_COUPLED def_bool n diff --git a/drivers/cpuidle/Kconfig.mips b/drivers/cpuidle/Kconfig.mips index c3c011af4a35..88728b2b4ea0 100644 --- a/drivers/cpuidle/Kconfig.mips +++ b/drivers/cpuidle/Kconfig.mips @@ -4,7 +4,7 @@ # config MIPS_CPS_CPUIDLE bool "CPU Idle driver for MIPS CPS platforms" - depends on CPU_IDLE && MIPS_CPS + depends on MIPS_CPS depends on SYS_SUPPORTS_MIPS_CPS select ARCH_NEEDS_CPU_IDLE_COUPLED if MIPS_MT || CPU_MIPSR6 select GENERIC_CLOCKEVENTS_BROADCAST if SMP diff --git a/drivers/cpuidle/Kconfig.powerpc b/drivers/cpuidle/Kconfig.powerpc index a797a02b7b6f..1931ac8faffb 100644 --- a/drivers/cpuidle/Kconfig.powerpc +++ b/drivers/cpuidle/Kconfig.powerpc @@ -4,7 +4,6 @@ # config PSERIES_CPUIDLE bool "Cpuidle driver for pSeries platforms" - depends on CPU_IDLE depends on PPC_PSERIES default y help @@ -13,7 +12,6 @@ config PSERIES_CPUIDLE config POWERNV_CPUIDLE bool "Cpuidle driver for powernv platforms" - depends on CPU_IDLE depends on PPC_POWERNV default y help -- cgit v1.2.3 From e648c7acc1e3507520af16cf5e6e9472878ec0a4 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:44 -0700 Subject: powercap: intel_rapl: Move MSR default settings into MSR interface driver MSR-specific RAPL defaults differ from those used by the TPMI interface. The MMIO and MSR interfaces shared the same rapl_defaults pointer in the common driver, but MMIO does not require the CPU-specific variations needed by MSR. Keeping these in the common driver adds unnecessary complexity and MSR-specific initialization. Move MSR defaults and CPU matching into the MSR interface driver. Moves ----- * Move rapl_check_unit_atom(), set_floor_freq_atom(), and rapl_compute_time_window_atom() into intel_rapl_msr.c. * Move MSR unit-field GENMASK definitions and local constants. * Move all MSR-related rapl_defaults tables and the CPU-ID matching logic (rapl_ids[]) into the MSR driver. * Move iosf_mbi dependencies (floor-frequency control and related MBI register definitions) as they are MSR-platform specific. Modifications ------------- * Replace the common driver's platform-device manual alloc/add sequence with platform_device_register_data() in the MSR driver to pass matching rapl_defaults as platform_data. * Update MSR driver probe to assign pdev->dev.platform_data to priv->defaults. * Update Atom helper functions to use rp->lead_cpu directly for MSR reads/writes instead of the generic get_rid(). * Update Atom floor frequency logic to access defaults via the package private data pointer. * Convert MSR device creation from fs_initcall() to module_init(). This preserves existing enumeration behavior as the driver was already using module_init(). * Since rapl_ids need to exist after boot, remove __initconst specifier. No functional changes are expected. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260331211950.3329932-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 228 +------------------------------- drivers/powercap/intel_rapl_msr.c | 250 ++++++++++++++++++++++++++++++++++- 2 files changed, 250 insertions(+), 228 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index e099514e6c56..1e8146283009 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -28,7 +28,6 @@ #include #include -#include #include /* bitmasks for RAPL MSRs, used by primitive access functions */ @@ -212,8 +211,6 @@ static int get_pl_prim(struct rapl_domain *rd, int pl, enum pl_prims prim) #define power_zone_to_rapl_domain(_zone) \ container_of(_zone, struct rapl_domain, power_zone) -static const struct rapl_defaults *defaults_msr; - static const struct rapl_defaults *get_defaults(struct rapl_package *rp) { return rp->priv->defaults; @@ -759,7 +756,6 @@ static int rapl_config(struct rapl_package *rp) /* MMIO I/F shares the same register layout as MSR registers */ case RAPL_IF_MMIO: case RAPL_IF_MSR: - rp->priv->defaults = defaults_msr; rp->priv->rpi = (void *)rpi_msr; break; case RAPL_IF_TPMI: @@ -947,34 +943,6 @@ int rapl_default_check_unit(struct rapl_domain *rd) } EXPORT_SYMBOL_NS_GPL(rapl_default_check_unit, "INTEL_RAPL"); -static int rapl_check_unit_atom(struct rapl_domain *rd) -{ - struct reg_action ra; - u32 value; - - ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; - ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { - pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", - ra.reg.val, rd->rp->name, rd->name); - return -ENODEV; - } - - value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; - rd->energy_unit = ENERGY_UNIT_SCALE * (1ULL << value); - - value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rd->power_unit = (1ULL << value) * MILLIWATT_PER_WATT; - - value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rd->time_unit = USEC_PER_SEC >> value; - - pr_debug("Atom %s:%s energy=%dpJ, time=%dus, power=%duW\n", - rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); - - return 0; -} - static void power_limit_irq_save_cpu(void *info) { u32 l, h = 0; @@ -1055,30 +1023,6 @@ void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode) } EXPORT_SYMBOL_NS_GPL(rapl_default_set_floor_freq, "INTEL_RAPL"); -static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) -{ - static u32 power_ctrl_orig_val; - const struct rapl_defaults *defaults = get_defaults(rd->rp); - u32 mdata; - - if (!defaults->floor_freq_reg_addr) { - pr_err("Invalid floor frequency config register\n"); - return; - } - - if (!power_ctrl_orig_val) - iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ, - defaults->floor_freq_reg_addr, - &power_ctrl_orig_val); - mdata = power_ctrl_orig_val; - if (enable) { - mdata &= ~GENMASK(14, 8); - mdata |= BIT(8); - } - iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE, - defaults->floor_freq_reg_addr, mdata); -} - u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw) { u64 f, y; /* fraction and exp. used for time unit */ @@ -1112,149 +1056,6 @@ u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_ } EXPORT_SYMBOL_NS_GPL(rapl_default_compute_time_window, "INTEL_RAPL"); -static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, - bool to_raw) -{ - if (to_raw) - return div64_u64(value, rd->time_unit); - - /* - * Atom time unit encoding is straight forward val * time_unit, - * where time_unit is default to 1 sec. Never 0. - */ - return (value) ? value * rd->time_unit : rd->time_unit; -} - -static const struct rapl_defaults rapl_defaults_core = { - .floor_freq_reg_addr = 0, - .check_unit = rapl_default_check_unit, - .set_floor_freq = rapl_default_set_floor_freq, - .compute_time_window = rapl_default_compute_time_window, -}; - -static const struct rapl_defaults rapl_defaults_hsw_server = { - .check_unit = rapl_default_check_unit, - .set_floor_freq = rapl_default_set_floor_freq, - .compute_time_window = rapl_default_compute_time_window, - .dram_domain_energy_unit = 15300, -}; - -static const struct rapl_defaults rapl_defaults_spr_server = { - .check_unit = rapl_default_check_unit, - .set_floor_freq = rapl_default_set_floor_freq, - .compute_time_window = rapl_default_compute_time_window, - .psys_domain_energy_unit = NANOJOULE_PER_JOULE, - .spr_psys_bits = true, -}; - -static const struct rapl_defaults rapl_defaults_byt = { - .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT, - .check_unit = rapl_check_unit_atom, - .set_floor_freq = set_floor_freq_atom, - .compute_time_window = rapl_compute_time_window_atom, -}; - -static const struct rapl_defaults rapl_defaults_tng = { - .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG, - .check_unit = rapl_check_unit_atom, - .set_floor_freq = set_floor_freq_atom, - .compute_time_window = rapl_compute_time_window_atom, -}; - -static const struct rapl_defaults rapl_defaults_ann = { - .floor_freq_reg_addr = 0, - .check_unit = rapl_check_unit_atom, - .set_floor_freq = NULL, - .compute_time_window = rapl_compute_time_window_atom, -}; - -static const struct rapl_defaults rapl_defaults_cht = { - .floor_freq_reg_addr = 0, - .check_unit = rapl_check_unit_atom, - .set_floor_freq = NULL, - .compute_time_window = rapl_compute_time_window_atom, -}; - -static const struct rapl_defaults rapl_defaults_amd = { - .check_unit = rapl_default_check_unit, -}; - -static const struct x86_cpu_id rapl_ids[] __initconst = { - X86_MATCH_VFM(INTEL_SANDYBRIDGE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &rapl_defaults_core), - - X86_MATCH_VFM(INTEL_IVYBRIDGE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &rapl_defaults_core), - - X86_MATCH_VFM(INTEL_HASWELL, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_HASWELL_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_HASWELL_G, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_HASWELL_X, &rapl_defaults_hsw_server), - - X86_MATCH_VFM(INTEL_BROADWELL, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_BROADWELL_G, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_BROADWELL_D, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_BROADWELL_X, &rapl_defaults_hsw_server), - - X86_MATCH_VFM(INTEL_SKYLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_SKYLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_SKYLAKE_X, &rapl_defaults_hsw_server), - X86_MATCH_VFM(INTEL_KABYLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_KABYLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_CANNONLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE_X, &rapl_defaults_hsw_server), - X86_MATCH_VFM(INTEL_ICELAKE_D, &rapl_defaults_hsw_server), - X86_MATCH_VFM(INTEL_COMETLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_COMETLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_TIGERLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ROCKETLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_BARTLETTLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), - X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), - X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_LAKEFIELD, &rapl_defaults_core), - - X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht), - X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng), - X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &rapl_defaults_ann), - X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_TREMONT, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &rapl_defaults_core), - - X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &rapl_defaults_hsw_server), - X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &rapl_defaults_hsw_server), - - X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd), - X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd), - X86_MATCH_VENDOR_FAM(AMD, 0x1A, &rapl_defaults_amd), - X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, rapl_ids); - /* Read once for all raw primitive data for domains */ static void rapl_update_domain_data(struct rapl_package *rp) { @@ -2266,40 +2067,13 @@ static struct notifier_block rapl_pm_notifier = { .notifier_call = rapl_pm_callback, }; -static struct platform_device *rapl_msr_platdev; - static int __init rapl_init(void) { - const struct x86_cpu_id *id; - int ret; - - id = x86_match_cpu(rapl_ids); - if (id) { - defaults_msr = (const struct rapl_defaults *)id->driver_data; - - rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0); - if (!rapl_msr_platdev) - return -ENOMEM; - - ret = platform_device_add(rapl_msr_platdev); - if (ret) { - platform_device_put(rapl_msr_platdev); - return ret; - } - } - - ret = register_pm_notifier(&rapl_pm_notifier); - if (ret && rapl_msr_platdev) { - platform_device_del(rapl_msr_platdev); - platform_device_put(rapl_msr_platdev); - } - - return ret; + return register_pm_notifier(&rapl_pm_notifier); } static void __exit rapl_exit(void) { - platform_device_unregister(rapl_msr_platdev); unregister_pm_notifier(&rapl_pm_notifier); } diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 9cd6a241df51..b7c10ed75d69 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -21,15 +21,33 @@ #include #include #include +#include +#include #include #include +#include #include /* Local defines */ #define MSR_PLATFORM_POWER_LIMIT 0x0000065C #define MSR_VR_CURRENT_CONFIG 0x00000601 +#define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */ + +#define POWER_UNIT_OFFSET 0x00 +#define POWER_UNIT_MASK GENMASK(3, 0) + +#define ENERGY_UNIT_OFFSET 0x08 +#define ENERGY_UNIT_MASK GENMASK(12, 8) + +#define TIME_UNIT_OFFSET 0x10 +#define TIME_UNIT_MASK GENMASK(19, 16) + +/* Sideband MBI registers */ +#define IOSF_CPU_POWER_BUDGET_CTL_BYT 0x02 +#define IOSF_CPU_POWER_BUDGET_CTL_TNG 0xDF + /* private data for RAPL MSR Interface */ static struct rapl_if_priv *rapl_msr_priv; @@ -185,6 +203,201 @@ static const struct x86_cpu_id pmu_support_ids[] = { {} }; +static int rapl_check_unit_atom(struct rapl_domain *rd) +{ + struct reg_action ra; + u32 value; + + ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; + ra.mask = ~0; + if (rapl_msr_read_raw(rd->rp->lead_cpu, &ra, false)) { + pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", + ra.reg.val, rd->rp->name, rd->name); + return -ENODEV; + } + + value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; + rd->energy_unit = ENERGY_UNIT_SCALE * (1ULL << value); + + value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; + rd->power_unit = (1ULL << value) * MILLIWATT_PER_WATT; + + value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; + rd->time_unit = USEC_PER_SEC >> value; + + pr_debug("Atom %s:%s energy=%dpJ, time=%dus, power=%duW\n", + rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); + + return 0; +} + +static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) +{ + static u32 power_ctrl_orig_val; + const struct rapl_defaults *defaults = rd->rp->priv->defaults; + u32 mdata; + + if (!defaults->floor_freq_reg_addr) { + pr_err("Invalid floor frequency config register\n"); + return; + } + + if (!power_ctrl_orig_val) + iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ, + defaults->floor_freq_reg_addr, + &power_ctrl_orig_val); + mdata = power_ctrl_orig_val; + if (enable) { + mdata &= ~GENMASK(14, 8); + mdata |= BIT(8); + } + iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE, + defaults->floor_freq_reg_addr, mdata); +} + +static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, + bool to_raw) +{ + if (to_raw) + return div64_u64(value, rd->time_unit); + + /* + * Atom time unit encoding is straight forward val * time_unit, + * where time_unit is default to 1 sec. Never 0. + */ + return value ? value * rd->time_unit : rd->time_unit; +} + +static const struct rapl_defaults rapl_defaults_core = { + .floor_freq_reg_addr = 0, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, +}; + +static const struct rapl_defaults rapl_defaults_hsw_server = { + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, + .dram_domain_energy_unit = 15300, +}; + +static const struct rapl_defaults rapl_defaults_spr_server = { + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, + .psys_domain_energy_unit = NANOJOULE_PER_JOULE, + .spr_psys_bits = true, +}; + +static const struct rapl_defaults rapl_defaults_byt = { + .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT, + .check_unit = rapl_check_unit_atom, + .set_floor_freq = set_floor_freq_atom, + .compute_time_window = rapl_compute_time_window_atom, +}; + +static const struct rapl_defaults rapl_defaults_tng = { + .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG, + .check_unit = rapl_check_unit_atom, + .set_floor_freq = set_floor_freq_atom, + .compute_time_window = rapl_compute_time_window_atom, +}; + +static const struct rapl_defaults rapl_defaults_ann = { + .floor_freq_reg_addr = 0, + .check_unit = rapl_check_unit_atom, + .set_floor_freq = NULL, + .compute_time_window = rapl_compute_time_window_atom, +}; + +static const struct rapl_defaults rapl_defaults_cht = { + .floor_freq_reg_addr = 0, + .check_unit = rapl_check_unit_atom, + .set_floor_freq = NULL, + .compute_time_window = rapl_compute_time_window_atom, +}; + +static const struct rapl_defaults rapl_defaults_amd = { + .check_unit = rapl_default_check_unit, +}; + +static const struct x86_cpu_id rapl_ids[] = { + X86_MATCH_VFM(INTEL_SANDYBRIDGE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_IVYBRIDGE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_HASWELL, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_HASWELL_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_HASWELL_G, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_HASWELL_X, &rapl_defaults_hsw_server), + + X86_MATCH_VFM(INTEL_BROADWELL, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BROADWELL_G, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BROADWELL_D, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BROADWELL_X, &rapl_defaults_hsw_server), + + X86_MATCH_VFM(INTEL_SKYLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SKYLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SKYLAKE_X, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_KABYLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_KABYLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_CANNONLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_X, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_ICELAKE_D, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_COMETLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_COMETLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_TIGERLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ROCKETLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BARTLETTLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), + X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_LAKEFIELD, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &rapl_defaults_ann), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_TREMONT, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &rapl_defaults_core), + + X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &rapl_defaults_hsw_server), + X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &rapl_defaults_hsw_server), + + X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd), + X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd), + X86_MATCH_VENDOR_FAM(AMD, 0x1A, &rapl_defaults_amd), + X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, rapl_ids); + static int rapl_msr_probe(struct platform_device *pdev) { const struct x86_cpu_id *id = x86_match_cpu(pl4_support_ids); @@ -204,6 +417,7 @@ static int rapl_msr_probe(struct platform_device *pdev) } rapl_msr_priv->read_raw = rapl_msr_read_raw; rapl_msr_priv->write_raw = rapl_msr_write_raw; + rapl_msr_priv->defaults = (const struct rapl_defaults *)pdev->dev.platform_data; if (id) { rapl_msr_priv->limits[RAPL_DOMAIN_PACKAGE] |= BIT(POWER_LIMIT4); @@ -258,7 +472,41 @@ static struct platform_driver intel_rapl_msr_driver = { }, }; -module_platform_driver(intel_rapl_msr_driver); +static struct platform_device *rapl_msr_platdev; + +static int intel_rapl_msr_init(void) +{ + const struct rapl_defaults *def; + const struct x86_cpu_id *id; + int ret; + + ret = platform_driver_register(&intel_rapl_msr_driver); + if (ret) + return ret; + + /* Create the MSR RAPL platform device for supported platforms */ + id = x86_match_cpu(rapl_ids); + if (!id) + return 0; + + def = (const struct rapl_defaults *)id->driver_data; + + rapl_msr_platdev = platform_device_register_data(NULL, "intel_rapl_msr", 0, def, + sizeof(*def)); + if (IS_ERR(rapl_msr_platdev)) + pr_debug("intel_rapl_msr device register failed, ret:%ld\n", + PTR_ERR(rapl_msr_platdev)); + + return 0; +} +module_init(intel_rapl_msr_init); + +static void intel_rapl_msr_exit(void) +{ + platform_device_unregister(rapl_msr_platdev); + platform_driver_unregister(&intel_rapl_msr_driver); +} +module_exit(intel_rapl_msr_exit); MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit) control via MSR interface"); MODULE_AUTHOR("Zhang Rui "); -- cgit v1.2.3 From 3e6996c0cbdbc32cfef4646660b994c1e0d96387 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:45 -0700 Subject: powercap: intel_rapl: Remove unused macro definitions Remove the following unused macro definitions from the RAPL common driver: * DOMAIN_STATE_INACTIVE and DOMAIN_STATE_POWER_LIMIT_SET * IOSF_CPU_POWER_BUDGET_CTL_BYT and IOSF_CPU_POWER_BUDGET_CTL_TNG * MAX_PRIM_NAME No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-3-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 1e8146283009..f2637cc2cc6a 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -96,15 +96,7 @@ /* per domain data, some are optional */ #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2) -#define DOMAIN_STATE_INACTIVE BIT(0) -#define DOMAIN_STATE_POWER_LIMIT_SET BIT(1) - -/* Sideband MBI registers */ -#define IOSF_CPU_POWER_BUDGET_CTL_BYT 0x02 -#define IOSF_CPU_POWER_BUDGET_CTL_TNG 0xDF - #define PACKAGE_PLN_INT_SAVED BIT(0) -#define MAX_PRIM_NAME 32 #define RAPL_EVENT_MASK GENMASK(7, 0) -- cgit v1.2.3 From 04bcbed4cd33495d05ba98857a748e416ab603b7 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:46 -0700 Subject: powercap: intel_rapl: Move primitive info to header for interface drivers RAPL primitive information varies across different RAPL interfaces (MSR, TPMI, MMIO). Keeping them in the common code adds no benefit, but requires interface-specific handling logic and makes the common layer unnecessarily complex. Move the primitive info infrastructure to the shared header to allow interface drivers to configure RAPL primitives. Specific changes: 1. Move struct rapl_primitive_info, enum unit_type, and PRIMITIVE_INFO_INIT macro to intel_rapl.h. 2. Change the @rpi field in struct rapl_if_priv from void * to struct rapl_primitive_info * to improve type safety and eliminate unnecessary casts. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL primitive settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-4-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 32 ++------------------------------ include/linux/intel_rapl.h | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index f2637cc2cc6a..ffc9d0378257 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -100,13 +100,6 @@ #define RAPL_EVENT_MASK GENMASK(7, 0) -enum unit_type { - ARBITRARY_UNIT, /* no translation */ - POWER_UNIT, - ENERGY_UNIT, - TIME_UNIT, -}; - static const char *pl_names[NR_POWER_LIMITS] = { [POWER_LIMIT1] = "long_term", [POWER_LIMIT2] = "short_term", @@ -208,27 +201,6 @@ static const struct rapl_defaults *get_defaults(struct rapl_package *rp) return rp->priv->defaults; } -/* per domain data. used to describe individual knobs such that access function - * can be consolidated into one instead of many inline functions. - */ -struct rapl_primitive_info { - const char *name; - u64 mask; - int shift; - enum rapl_domain_reg_id id; - enum unit_type unit; - u32 flag; -}; - -#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ - .name = #p, \ - .mask = m, \ - .shift = s, \ - .id = i, \ - .unit = u, \ - .flag = f \ - } - static void rapl_init_domains(struct rapl_package *rp); static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, @@ -748,10 +720,10 @@ static int rapl_config(struct rapl_package *rp) /* MMIO I/F shares the same register layout as MSR registers */ case RAPL_IF_MMIO: case RAPL_IF_MSR: - rp->priv->rpi = (void *)rpi_msr; + rp->priv->rpi = rpi_msr; break; case RAPL_IF_TPMI: - rp->priv->rpi = (void *)rpi_tpmi; + rp->priv->rpi = rpi_tpmi; break; default: return -EINVAL; diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 9e6bd654be1f..01f290de3586 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -137,6 +137,34 @@ struct rapl_defaults { bool spr_psys_bits; }; +#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ + .name = #p, \ + .mask = m, \ + .shift = s, \ + .id = i, \ + .unit = u, \ + .flag = f \ + } + +enum unit_type { + ARBITRARY_UNIT, /* no translation */ + POWER_UNIT, + ENERGY_UNIT, + TIME_UNIT, +}; + +/* per domain data. used to describe individual knobs such that access function + * can be consolidated into one instead of many inline functions. + */ +struct rapl_primitive_info { + const char *name; + u64 mask; + int shift; + enum rapl_domain_reg_id id; + enum unit_type unit; + u32 flag; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -152,7 +180,7 @@ struct rapl_defaults { * @write_raw: Callback for writing RAPL interface specific * registers. * @defaults: pointer to default settings - * @rpi: internal pointer to interface primitive info + * @rpi: pointer to interface primitive info */ struct rapl_if_priv { enum rapl_if_type type; @@ -164,7 +192,7 @@ struct rapl_if_priv { int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); const struct rapl_defaults *defaults; - void *rpi; + struct rapl_primitive_info *rpi; }; #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From b874996a7c54fbcf684003442936a9711e353482 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:47 -0700 Subject: powercap: intel_rapl: Move TPMI primitives to TPMI driver TPMI-specific RAPL primitives differ from those used by MSR and MMIO interfaces. Keeping them in the common RAPL driver requires interface-specific handling logic and makes the common layer unnecessarily complex. Move the TPMI primitive definitions and associated bitmasks into the TPMI interface driver. This change includes: 1. Move TPMI-specific bitmask definitions from intel_rapl_common.c to intel_rapl_tpmi.c. 2. Add TPMI-local struct rapl_primitive_info instance and assign it to priv->rpi during TPMI probe. 3. Remove the RAPL TPMI related definitions from the common driver. No functional changes are intended. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-5-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 51 ---------------------------------- drivers/powercap/intel_rapl_tpmi.c | 53 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 51 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index ffc9d0378257..06912cb805f7 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -79,15 +79,6 @@ #define PSYS_TIME_WINDOW1_MASK GENMASK_ULL(25, 19) #define PSYS_TIME_WINDOW2_MASK GENMASK_ULL(57, 51) -/* bitmasks for RAPL TPMI, used by primitive access functions */ -#define TPMI_POWER_LIMIT_MASK GENMASK_ULL(17, 0) -#define TPMI_POWER_LIMIT_ENABLE BIT_ULL(62) -#define TPMI_TIME_WINDOW_MASK GENMASK_ULL(24, 18) -#define TPMI_INFO_SPEC_MASK GENMASK_ULL(17, 0) -#define TPMI_INFO_MIN_MASK GENMASK_ULL(35, 18) -#define TPMI_INFO_MAX_MASK GENMASK_ULL(53, 36) -#define TPMI_INFO_MAX_TIME_WIN_MASK GENMASK_ULL(60, 54) - /* Non HW constants */ #define RAPL_PRIMITIVE_DUMMY BIT(2) @@ -665,45 +656,6 @@ static struct rapl_primitive_info rpi_msr[NR_RAPL_PRIMITIVES] = { 51, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), }; -/* RAPL primitives for TPMI I/F */ -static struct rapl_primitive_info rpi_tpmi[NR_RAPL_PRIMITIVES] = { - /* name, mask, shift, msr index, unit divisor */ - [POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, TPMI_POWER_LIMIT_MASK, 0, - RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, TPMI_POWER_LIMIT_MASK, 0, - RAPL_DOMAIN_REG_PL2, POWER_UNIT, 0), - [POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, TPMI_POWER_LIMIT_MASK, 0, - RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), - [ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, - RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), - [PL1_LOCK] = PRIMITIVE_INFO_INIT(PL1_LOCK, POWER_HIGH_LOCK, 63, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL2_LOCK] = PRIMITIVE_INFO_INIT(PL2_LOCK, POWER_HIGH_LOCK, 63, - RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0), - [PL4_LOCK] = PRIMITIVE_INFO_INIT(PL4_LOCK, POWER_HIGH_LOCK, 63, - RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), - [PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, - RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0), - [PL4_ENABLE] = PRIMITIVE_INFO_INIT(PL4_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, - RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), - [TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TPMI_TIME_WINDOW_MASK, 18, - RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - [TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TPMI_TIME_WINDOW_MASK, 18, - RAPL_DOMAIN_REG_PL2, TIME_UNIT, 0), - [THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, TPMI_INFO_SPEC_MASK, 0, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, TPMI_INFO_MAX_MASK, 36, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, TPMI_INFO_MIN_MASK, 18, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, TPMI_INFO_MAX_TIME_WIN_MASK, - 54, RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), - [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, - 0, RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), -}; - static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim) { struct rapl_primitive_info *rpi = rp->priv->rpi; @@ -722,9 +674,6 @@ static int rapl_config(struct rapl_package *rp) case RAPL_IF_MSR: rp->priv->rpi = rpi_msr; break; - case RAPL_IF_TPMI: - rp->priv->rpi = rpi_tpmi; - break; default: return -EINVAL; } diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index c06d687366fc..7f41491d9cd1 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -62,6 +62,58 @@ static DEFINE_MUTEX(tpmi_rapl_lock); static struct powercap_control_type *tpmi_control_type; +/* bitmasks for RAPL TPMI, used by primitive access functions */ +#define TPMI_POWER_LIMIT_MASK GENMASK_ULL(17, 0) +#define TPMI_POWER_LIMIT_ENABLE BIT_ULL(62) +#define TPMI_POWER_HIGH_LOCK BIT_ULL(63) +#define TPMI_TIME_WINDOW_MASK GENMASK_ULL(24, 18) +#define TPMI_INFO_SPEC_MASK GENMASK_ULL(17, 0) +#define TPMI_INFO_MIN_MASK GENMASK_ULL(35, 18) +#define TPMI_INFO_MAX_MASK GENMASK_ULL(53, 36) +#define TPMI_INFO_MAX_TIME_WIN_MASK GENMASK_ULL(60, 54) +#define TPMI_ENERGY_STATUS_MASK GENMASK(31, 0) +#define TPMI_PERF_STATUS_THROTTLE_TIME_MASK GENMASK(31, 0) + +/* RAPL primitives for TPMI I/F */ +static struct rapl_primitive_info rpi_tpmi[NR_RAPL_PRIMITIVES] = { + /* name, mask, shift, msr index, unit divisor */ + [POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, TPMI_POWER_LIMIT_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, TPMI_POWER_LIMIT_MASK, 0, + RAPL_DOMAIN_REG_PL2, POWER_UNIT, 0), + [POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, TPMI_POWER_LIMIT_MASK, 0, + RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), + [ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, TPMI_ENERGY_STATUS_MASK, 0, + RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), + [PL1_LOCK] = PRIMITIVE_INFO_INIT(PL1_LOCK, TPMI_POWER_HIGH_LOCK, 63, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_LOCK] = PRIMITIVE_INFO_INIT(PL2_LOCK, TPMI_POWER_HIGH_LOCK, 63, + RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0), + [PL4_LOCK] = PRIMITIVE_INFO_INIT(PL4_LOCK, TPMI_POWER_HIGH_LOCK, 63, + RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), + [PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, + RAPL_DOMAIN_REG_PL2, ARBITRARY_UNIT, 0), + [PL4_ENABLE] = PRIMITIVE_INFO_INIT(PL4_ENABLE, TPMI_POWER_LIMIT_ENABLE, 62, + RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0), + [TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TPMI_TIME_WINDOW_MASK, 18, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TPMI_TIME_WINDOW_MASK, 18, + RAPL_DOMAIN_REG_PL2, TIME_UNIT, 0), + [THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, TPMI_INFO_SPEC_MASK, 0, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, TPMI_INFO_MAX_MASK, 36, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, TPMI_INFO_MIN_MASK, 18, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, TPMI_INFO_MAX_TIME_WIN_MASK, + 54, RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), + [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, + TPMI_PERF_STATUS_THROTTLE_TIME_MASK, + 0, RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), +}; + static int tpmi_rapl_read_raw(int id, struct reg_action *ra, bool atomic) { if (!ra->reg.mmio) @@ -344,6 +396,7 @@ static int intel_rapl_tpmi_probe(struct auxiliary_device *auxdev, trp->priv.write_raw = tpmi_rapl_write_raw; trp->priv.control_type = tpmi_control_type; trp->priv.defaults = &defaults_tpmi; + trp->priv.rpi = rpi_tpmi; /* RAPL TPMI I/F is per physical package */ trp->rp = rapl_find_package_domain(info->package_id, &trp->priv, false); -- cgit v1.2.3 From d7a718fff34b8b4a2d54672f2c685aef7a281b5e Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:48 -0700 Subject: thermal: intel: int340x: processor: Move MMIO primitives to MMIO driver MMIO-specific primitives differ from those used by the TPMI interface. The MSR and MMIO interfaces shared the same primitives in the common driver, but MMIO does not require many MSR-specific entries (like PSYS). Keeping these in the common driver does not add any value and requires interface-specific handling logic that makes the common layer unnecessarily complex. Move the MMIO primitive definitions and associated bitmasks into the MMIO interface driver. This change includes: 1. Add MMIO-local struct rapl_primitive_info instance without MSR-specific entries and assign it to priv->rpi during MMIO initialization. 2. Remove the RAPL MMIO case from rapl_config() in the common driver. No functional changes are intended. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-6-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 1 - .../intel/int340x_thermal/processor_thermal_rapl.c | 72 ++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 06912cb805f7..7c5e16598ba3 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -670,7 +670,6 @@ static int rapl_config(struct rapl_package *rp) { switch (rp->priv->type) { /* MMIO I/F shares the same register layout as MSR registers */ - case RAPL_IF_MMIO: case RAPL_IF_MSR: rp->priv->rpi = rpi_msr; break; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c index 5dbeb0a43c8c..f8b9745c1b8a 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c @@ -11,6 +11,77 @@ static struct rapl_if_priv rapl_mmio_priv; +/* bitmasks for RAPL MSRs, used by primitive access functions */ +#define MMIO_ENERGY_STATUS_MASK GENMASK(31, 0) + +#define MMIO_POWER_LIMIT1_MASK GENMASK(14, 0) +#define MMIO_POWER_LIMIT1_ENABLE BIT(15) +#define MMIO_POWER_LIMIT1_CLAMP BIT(16) + +#define MMIO_POWER_LIMIT2_MASK GENMASK_ULL(46, 32) +#define MMIO_POWER_LIMIT2_ENABLE BIT_ULL(47) +#define MMIO_POWER_LIMIT2_CLAMP BIT_ULL(48) + +#define MMIO_POWER_LOW_LOCK BIT(31) +#define MMIO_POWER_HIGH_LOCK BIT_ULL(63) + +#define MMIO_POWER_LIMIT4_MASK GENMASK(12, 0) + +#define MMIO_TIME_WINDOW1_MASK GENMASK_ULL(23, 17) +#define MMIO_TIME_WINDOW2_MASK GENMASK_ULL(55, 49) + +#define MMIO_POWER_INFO_MAX_MASK GENMASK_ULL(46, 32) +#define MMIO_POWER_INFO_MIN_MASK GENMASK_ULL(30, 16) +#define MMIO_POWER_INFO_MAX_TIME_WIN_MASK GENMASK_ULL(53, 48) +#define MMIO_POWER_INFO_THERMAL_SPEC_MASK GENMASK(14, 0) + +#define MMIO_PERF_STATUS_THROTTLE_TIME_MASK GENMASK(31, 0) +#define MMIO_PP_POLICY_MASK GENMASK(4, 0) + +/* RAPL primitives for MMIO I/F */ +static struct rapl_primitive_info rpi_mmio[NR_RAPL_PRIMITIVES] = { + /* name, mask, shift, msr index, unit divisor */ + [POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, MMIO_POWER_LIMIT1_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, MMIO_POWER_LIMIT2_MASK, 32, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, MMIO_POWER_LIMIT4_MASK, 0, + RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), + [ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, MMIO_ENERGY_STATUS_MASK, 0, + RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), + [FW_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, MMIO_POWER_LOW_LOCK, 31, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [FW_HIGH_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, MMIO_POWER_HIGH_LOCK, 63, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, MMIO_POWER_LIMIT1_ENABLE, 15, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL1_CLAMP] = PRIMITIVE_INFO_INIT(PL1_CLAMP, MMIO_POWER_LIMIT1_CLAMP, 16, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, MMIO_POWER_LIMIT2_ENABLE, 47, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_CLAMP] = PRIMITIVE_INFO_INIT(PL2_CLAMP, MMIO_POWER_LIMIT2_CLAMP, 48, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, MMIO_TIME_WINDOW1_MASK, 17, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, MMIO_TIME_WINDOW2_MASK, 49, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, + MMIO_POWER_INFO_THERMAL_SPEC_MASK, 0, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, MMIO_POWER_INFO_MAX_MASK, 32, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, MMIO_POWER_INFO_MIN_MASK, 16, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, + MMIO_POWER_INFO_MAX_TIME_WIN_MASK, 48, + RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), + [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, + MMIO_PERF_STATUS_THROTTLE_TIME_MASK, 0, + RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), + [PRIORITY_LEVEL] = PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, MMIO_PP_POLICY_MASK, 0, + RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), +}; + static const struct rapl_mmio_regs rapl_mmio_default = { .reg_unit = 0x5938, .regs[RAPL_DOMAIN_PACKAGE] = { 0x59a0, 0x593c, 0x58f0, 0, 0x5930, 0x59b0}, @@ -75,6 +146,7 @@ int proc_thermal_rapl_add(struct pci_dev *pdev, struct proc_thermal_device *proc rapl_mmio_priv.read_raw = rapl_mmio_read_raw; rapl_mmio_priv.write_raw = rapl_mmio_write_raw; rapl_mmio_priv.defaults = &rapl_defaults_mmio; + rapl_mmio_priv.rpi = rpi_mmio; rapl_mmio_priv.control_type = powercap_register_control_type(NULL, "intel-rapl-mmio", NULL); if (IS_ERR(rapl_mmio_priv.control_type)) { -- cgit v1.2.3 From b0ee5110ef1c684eab41bd03fbe8bf2a8f964054 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:49 -0700 Subject: powercap: intel_rapl: Move MSR primitives to MSR driver MSR-specific RAPL primitives differ from those used by TPMI and MMIO interfaces. Keeping them in the common driver requires interface-specific handling logic and makes the common layer unnecessarily complex. Move the MSR primitive definitions and associated bitmasks into the MSR interface driver. This change includes: 1. Move MSR-specific bitmask definitions to RAPL MSR driver. 2. Add MSR-local struct rapl_primitive_info instance and assign it to priv->rpi during MSR probe. 3. Remove the primitive assignment logic from rapl_config() in the common driver. No functional changes are intended. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-7-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 105 ----------------------------------- drivers/powercap/intel_rapl_msr.c | 99 +++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 105 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 7c5e16598ba3..a8dd02dff0a0 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -30,24 +30,8 @@ #include #include -/* bitmasks for RAPL MSRs, used by primitive access functions */ #define ENERGY_STATUS_MASK GENMASK(31, 0) -#define POWER_LIMIT1_MASK GENMASK(14, 0) -#define POWER_LIMIT1_ENABLE BIT(15) -#define POWER_LIMIT1_CLAMP BIT(16) - -#define POWER_LIMIT2_MASK GENMASK_ULL(46, 32) -#define POWER_LIMIT2_ENABLE BIT_ULL(47) -#define POWER_LIMIT2_CLAMP BIT_ULL(48) -#define POWER_HIGH_LOCK BIT_ULL(63) -#define POWER_LOW_LOCK BIT(31) - -#define POWER_LIMIT4_MASK GENMASK(12, 0) - -#define TIME_WINDOW1_MASK GENMASK_ULL(23, 17) -#define TIME_WINDOW2_MASK GENMASK_ULL(55, 49) - #define POWER_UNIT_OFFSET 0x00 #define POWER_UNIT_MASK GENMASK(3, 0) @@ -57,28 +41,6 @@ #define TIME_UNIT_OFFSET 0x10 #define TIME_UNIT_MASK GENMASK(19, 16) -#define POWER_INFO_MAX_MASK GENMASK_ULL(46, 32) -#define POWER_INFO_MIN_MASK GENMASK_ULL(30, 16) -#define POWER_INFO_MAX_TIME_WIN_MASK GENMASK_ULL(53, 48) -#define POWER_INFO_THERMAL_SPEC_MASK GENMASK(14, 0) - -#define PERF_STATUS_THROTTLE_TIME_MASK GENMASK(31, 0) -#define PP_POLICY_MASK GENMASK(4, 0) - -/* - * SPR has different layout for Psys Domain PowerLimit registers. - * There are 17 bits of PL1 and PL2 instead of 15 bits. - * The Enable bits and TimeWindow bits are also shifted as a result. - */ -#define PSYS_POWER_LIMIT1_MASK GENMASK_ULL(16, 0) -#define PSYS_POWER_LIMIT1_ENABLE BIT(17) - -#define PSYS_POWER_LIMIT2_MASK GENMASK_ULL(48, 32) -#define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49) - -#define PSYS_TIME_WINDOW1_MASK GENMASK_ULL(25, 19) -#define PSYS_TIME_WINDOW2_MASK GENMASK_ULL(57, 51) - /* Non HW constants */ #define RAPL_PRIMITIVE_DUMMY BIT(2) @@ -598,64 +560,6 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, return div64_u64(value, scale); } -/* RAPL primitives for MSR and MMIO I/F */ -static struct rapl_primitive_info rpi_msr[NR_RAPL_PRIMITIVES] = { - /* name, mask, shift, msr index, unit divisor */ - [POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0, - RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32, - RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0, - RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), - [ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, - RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), - [FW_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [FW_HIGH_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_HIGH_LOCK, 63, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL1_CLAMP] = PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [PL2_CLAMP] = PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48, - RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), - [TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17, - RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - [TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49, - RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - [THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, - POWER_INFO_THERMAL_SPEC_MASK, 0, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16, - RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), - [MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, - POWER_INFO_MAX_TIME_WIN_MASK, 48, - RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), - [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, - PERF_STATUS_THROTTLE_TIME_MASK, 0, - RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), - [PRIORITY_LEVEL] = PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, - RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), - [PSYS_POWER_LIMIT1] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0, - RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [PSYS_POWER_LIMIT2] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, - 32, RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), - [PSYS_PL1_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, - 17, RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, - 0), - [PSYS_PL2_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, - 49, RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, - 0), - [PSYS_TIME_WINDOW1] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, - 19, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), - [PSYS_TIME_WINDOW2] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, - 51, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), -}; - static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim) { struct rapl_primitive_info *rpi = rp->priv->rpi; @@ -668,15 +572,6 @@ static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim) static int rapl_config(struct rapl_package *rp) { - switch (rp->priv->type) { - /* MMIO I/F shares the same register layout as MSR registers */ - case RAPL_IF_MSR: - rp->priv->rpi = rpi_msr; - break; - default: - return -EINVAL; - } - /* defaults_msr can be NULL on unsupported platforms */ if (!rp->priv->defaults || !rp->priv->rpi) return -ENODEV; diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index b7c10ed75d69..cfb35973f0b5 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -44,6 +44,46 @@ #define TIME_UNIT_OFFSET 0x10 #define TIME_UNIT_MASK GENMASK(19, 16) +/* bitmasks for RAPL MSRs, used by primitive access functions */ +#define ENERGY_STATUS_MASK GENMASK(31, 0) + +#define POWER_LIMIT1_MASK GENMASK(14, 0) +#define POWER_LIMIT1_ENABLE BIT(15) +#define POWER_LIMIT1_CLAMP BIT(16) + +#define POWER_LIMIT2_MASK GENMASK_ULL(46, 32) +#define POWER_LIMIT2_ENABLE BIT_ULL(47) +#define POWER_LIMIT2_CLAMP BIT_ULL(48) +#define POWER_HIGH_LOCK BIT_ULL(63) +#define POWER_LOW_LOCK BIT(31) + +#define POWER_LIMIT4_MASK GENMASK(12, 0) + +#define TIME_WINDOW1_MASK GENMASK_ULL(23, 17) +#define TIME_WINDOW2_MASK GENMASK_ULL(55, 49) + +#define POWER_INFO_MAX_MASK GENMASK_ULL(46, 32) +#define POWER_INFO_MIN_MASK GENMASK_ULL(30, 16) +#define POWER_INFO_MAX_TIME_WIN_MASK GENMASK_ULL(53, 48) +#define POWER_INFO_THERMAL_SPEC_MASK GENMASK(14, 0) + +#define PERF_STATUS_THROTTLE_TIME_MASK GENMASK(31, 0) +#define PP_POLICY_MASK GENMASK(4, 0) + +/* + * SPR has different layout for Psys Domain PowerLimit registers. + * There are 17 bits of PL1 and PL2 instead of 15 bits. + * The Enable bits and TimeWindow bits are also shifted as a result. + */ +#define PSYS_POWER_LIMIT1_MASK GENMASK_ULL(16, 0) +#define PSYS_POWER_LIMIT1_ENABLE BIT(17) + +#define PSYS_POWER_LIMIT2_MASK GENMASK_ULL(48, 32) +#define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49) + +#define PSYS_TIME_WINDOW1_MASK GENMASK_ULL(25, 19) +#define PSYS_TIME_WINDOW2_MASK GENMASK_ULL(57, 51) + /* Sideband MBI registers */ #define IOSF_CPU_POWER_BUDGET_CTL_BYT 0x02 #define IOSF_CPU_POWER_BUDGET_CTL_TNG 0xDF @@ -268,6 +308,64 @@ static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, return value ? value * rd->time_unit : rd->time_unit; } +/* RAPL primitives for MSR I/F */ +static struct rapl_primitive_info rpi_msr[NR_RAPL_PRIMITIVES] = { + /* name, mask, shift, msr index, unit divisor */ + [POWER_LIMIT1] = PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [POWER_LIMIT2] = PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [POWER_LIMIT4] = PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0, + RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0), + [ENERGY_COUNTER] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, + RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0), + [FW_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [FW_HIGH_LOCK] = PRIMITIVE_INFO_INIT(FW_LOCK, POWER_HIGH_LOCK, 63, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL1_ENABLE] = PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL1_CLAMP] = PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_ENABLE] = PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [PL2_CLAMP] = PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48, + RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0), + [TIME_WINDOW1] = PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [TIME_WINDOW2] = PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49, + RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [THERMAL_SPEC_POWER] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, + POWER_INFO_THERMAL_SPEC_MASK, 0, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_POWER] = PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MIN_POWER] = PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16, + RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0), + [MAX_TIME_WINDOW] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, + POWER_INFO_MAX_TIME_WIN_MASK, 48, + RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0), + [THROTTLED_TIME] = PRIMITIVE_INFO_INIT(THROTTLED_TIME, + PERF_STATUS_THROTTLE_TIME_MASK, 0, + RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0), + [PRIORITY_LEVEL] = PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, + RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0), + [PSYS_POWER_LIMIT1] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0, + RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [PSYS_POWER_LIMIT2] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, + 32, RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0), + [PSYS_PL1_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, + 17, RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, + 0), + [PSYS_PL2_ENABLE] = PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, + 49, RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, + 0), + [PSYS_TIME_WINDOW1] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, + 19, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), + [PSYS_TIME_WINDOW2] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, + 51, RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0), +}; + static const struct rapl_defaults rapl_defaults_core = { .floor_freq_reg_addr = 0, .check_unit = rapl_default_check_unit, @@ -418,6 +516,7 @@ static int rapl_msr_probe(struct platform_device *pdev) rapl_msr_priv->read_raw = rapl_msr_read_raw; rapl_msr_priv->write_raw = rapl_msr_write_raw; rapl_msr_priv->defaults = (const struct rapl_defaults *)pdev->dev.platform_data; + rapl_msr_priv->rpi = rpi_msr; if (id) { rapl_msr_priv->limits[RAPL_DOMAIN_PACKAGE] |= BIT(POWER_LIMIT4); -- cgit v1.2.3 From c3bb8d4f5d802ec1a16f018e82030bccb7a053a4 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 31 Mar 2026 14:19:50 -0700 Subject: powercap: intel_rapl: Consolidate PL4 and PMU support flags into rapl_defaults Currently, PL4 and MSR-based RAPL PMU support are detected using separate CPU ID tables (pl4_support_ids and pmu_support_ids) in the MSR driver probe path. This creates a maintenance burden since adding a new CPU requires updates in two places: the rapl_ids table and one or both of these capability tables. Consolidate PL4 and PMU capability information directly into struct rapl_defaults by adding msr_pl4_support and msr_pmu_support flags. This allows per-CPU capability to be expressed in a single place alongside other per-CPU defaults, eliminating the duplicate CPU ID tables entirely. No functional changes are intended. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Acked-by: Srinivas Pandruvada Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20260331211950.3329932-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_msr.c | 83 +++++++++++++++++---------------------- include/linux/intel_rapl.h | 2 + 2 files changed, 38 insertions(+), 47 deletions(-) diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index cfb35973f0b5..a34543e66446 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -216,33 +216,6 @@ static int rapl_msr_write_raw(int cpu, struct reg_action *ra) return ra->err; } -/* List of verified CPUs. */ -static const struct x86_cpu_id pl4_support_ids[] = { - X86_MATCH_VFM(INTEL_ICELAKE_L, NULL), - X86_MATCH_VFM(INTEL_TIGERLAKE_L, NULL), - X86_MATCH_VFM(INTEL_ALDERLAKE, NULL), - X86_MATCH_VFM(INTEL_ALDERLAKE_L, NULL), - X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, NULL), - X86_MATCH_VFM(INTEL_RAPTORLAKE, NULL), - X86_MATCH_VFM(INTEL_RAPTORLAKE_P, NULL), - X86_MATCH_VFM(INTEL_METEORLAKE, NULL), - X86_MATCH_VFM(INTEL_METEORLAKE_L, NULL), - X86_MATCH_VFM(INTEL_ARROWLAKE_U, NULL), - X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), - X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), - X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), - X86_MATCH_VFM(INTEL_NOVALAKE, NULL), - X86_MATCH_VFM(INTEL_NOVALAKE_L, NULL), - {} -}; - -/* List of MSR-based RAPL PMU support CPUs */ -static const struct x86_cpu_id pmu_support_ids[] = { - X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), - X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), - {} -}; - static int rapl_check_unit_atom(struct rapl_domain *rd) { struct reg_action ra; @@ -420,6 +393,23 @@ static const struct rapl_defaults rapl_defaults_amd = { .check_unit = rapl_default_check_unit, }; +static const struct rapl_defaults rapl_defaults_core_pl4 = { + .floor_freq_reg_addr = 0, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, + .msr_pl4_support = 1, +}; + +static const struct rapl_defaults rapl_defaults_core_pl4_pmu = { + .floor_freq_reg_addr = 0, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, + .msr_pl4_support = 1, + .msr_pmu_support = 1, +}; + static const struct x86_cpu_id rapl_ids[] = { X86_MATCH_VFM(INTEL_SANDYBRIDGE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &rapl_defaults_core), @@ -443,35 +433,35 @@ static const struct x86_cpu_id rapl_ids[] = { X86_MATCH_VFM(INTEL_KABYLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_KABYLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_CANNONLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ICELAKE_L, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_ICELAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ICELAKE_X, &rapl_defaults_hsw_server), X86_MATCH_VFM(INTEL_ICELAKE_D, &rapl_defaults_hsw_server), X86_MATCH_VFM(INTEL_COMETLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_COMETLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_TIGERLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ROCKETLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ALDERLAKE, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core), X86_MATCH_VFM(INTEL_BARTLETTLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core_pl4_pmu), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core_pl4_pmu), + X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core_pl4), + X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), - X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core_pl4), X86_MATCH_VFM(INTEL_LAKEFIELD, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), @@ -498,7 +488,6 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_ids); static int rapl_msr_probe(struct platform_device *pdev) { - const struct x86_cpu_id *id = x86_match_cpu(pl4_support_ids); int ret; switch (boot_cpu_data.x86_vendor) { @@ -518,16 +507,16 @@ static int rapl_msr_probe(struct platform_device *pdev) rapl_msr_priv->defaults = (const struct rapl_defaults *)pdev->dev.platform_data; rapl_msr_priv->rpi = rpi_msr; - if (id) { + if (rapl_msr_priv->defaults->msr_pl4_support) { rapl_msr_priv->limits[RAPL_DOMAIN_PACKAGE] |= BIT(POWER_LIMIT4); rapl_msr_priv->regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_PL4].msr = MSR_VR_CURRENT_CONFIG; - pr_info("PL4 support detected.\n"); + pr_info("PL4 support detected (updated).\n"); } - if (x86_match_cpu(pmu_support_ids)) { + if (rapl_msr_priv->defaults->msr_pmu_support) { rapl_msr_pmu = true; - pr_info("MSR-based RAPL PMU support enabled\n"); + pr_info("MSR-based RAPL PMU support enabled (updated)\n"); } rapl_msr_priv->control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 01f290de3586..328004f605c3 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -135,6 +135,8 @@ struct rapl_defaults { unsigned int dram_domain_energy_unit; unsigned int psys_domain_energy_unit; bool spr_psys_bits; + bool msr_pl4_support; + bool msr_pmu_support; }; #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ -- cgit v1.2.3 From beda3b363546a423e4e29a7395e04c0ac4ff677e Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:45 +0530 Subject: amd-pstate: Fix memory leak in amd_pstate_epp_cpu_init() On failure to set the epp, the function amd_pstate_epp_cpu_init() returns with an error code without freeing the cpudata object that was allocated at the beginning of the function. Ensure that the cpudata object is freed before returning from the function. This memory leak was discovered by Claude Opus 4.6 with the aid of Chris Mason's AI review-prompts (https://github.com/masoncl/review-prompts/tree/main/kernel). Assisted-by: Claude:claude-opus-4.6 review-prompts/linux Fixes: f9a378ff6443 ("cpufreq/amd-pstate: Set different default EPP policy for Epyc and Ryzen") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5aa9fcd80cf5..d57969c72c9d 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1533,7 +1533,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) ret = amd_pstate_set_epp(policy, cpudata->epp_default); if (ret) - return ret; + goto free_cpudata1; current_pstate_driver->adjust_perf = NULL; -- cgit v1.2.3 From fcc25a291fbdca2c06c2c6602532050873f0c9de Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:46 +0530 Subject: amd-pstate: Update cppc_req_cached in fast_switch case The function msr_update_perf() does not cache the new value that is written to MSR_AMD_CPPC_REQ into the variable cpudata->cppc_req_cached when the update is happening from the fast path. Fix that by caching the value everytime the MSR_AMD_CPPC_REQ gets updated. This issue was discovered by Claude Opus 4.6 with the aid of Chris Mason's AI review-prompts (https://github.com/masoncl/review-prompts/tree/main/kernel). Assisted-by: Claude:claude-opus-4.6 review-prompts/linux Reviewed-by: Mario Limonciello (AMD) Fixes: fff395796917 ("cpufreq/amd-pstate: Always write EPP value when updating perf") Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d57969c72c9d..24cdeffbcd40 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -261,7 +261,6 @@ static int msr_update_perf(struct cpufreq_policy *policy, u8 min_perf, if (fast_switch) { wrmsrq(MSR_AMD_CPPC_REQ, value); - return 0; } else { int ret = wrmsrq_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); -- cgit v1.2.3 From e67a5b6541831bbf1c40b6042a867a4594ec6b55 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:47 +0530 Subject: amd-pstate: Make certain freq_attrs conditionally visible Certain amd_pstate freq_attrs such as amd_pstate_hw_prefcore and amd_pstate_prefcore_ranking are enabled even when preferred core is not supported on the platform. Similarly there are common freq_attrs between the amd-pstate and the amd-pstate-epp drivers (eg: amd_pstate_max_freq, amd_pstate_lowest_nonlinear_freq, etc.) but are duplicated in two different freq_attr structs. Unify all the attributes in a single place and associate each of them with a visibility function that determines whether the attribute should be visible based on the underlying platform support and the current amd_pstate mode. Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 124 ++++++++++++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 31 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 24cdeffbcd40..4de2037a414c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1220,12 +1220,87 @@ static ssize_t show_energy_performance_preference( return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); } +cpufreq_freq_attr_ro(amd_pstate_max_freq); +cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); + +cpufreq_freq_attr_ro(amd_pstate_highest_perf); +cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); +cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); +cpufreq_freq_attr_rw(energy_performance_preference); +cpufreq_freq_attr_ro(energy_performance_available_preferences); + +struct freq_attr_visibility { + struct freq_attr *attr; + bool (*visibility_fn)(void); +}; + +/* For attributes which are always visible */ +static bool always_visible(void) +{ + return true; +} + +/* Determines whether prefcore related attributes should be visible */ +static bool prefcore_visibility(void) +{ + return amd_pstate_prefcore; +} + +/* Determines whether energy performance preference should be visible */ +static bool epp_visibility(void) +{ + return cppc_state == AMD_PSTATE_ACTIVE; +} + +static struct freq_attr_visibility amd_pstate_attr_visibility[] = { + {&amd_pstate_max_freq, always_visible}, + {&amd_pstate_lowest_nonlinear_freq, always_visible}, + {&amd_pstate_highest_perf, always_visible}, + {&amd_pstate_prefcore_ranking, prefcore_visibility}, + {&amd_pstate_hw_prefcore, prefcore_visibility}, + {&energy_performance_preference, epp_visibility}, + {&energy_performance_available_preferences, epp_visibility}, +}; + +static struct freq_attr **get_freq_attrs(void) +{ + bool attr_visible[ARRAY_SIZE(amd_pstate_attr_visibility)]; + struct freq_attr **attrs; + int i, j, count; + + for (i = 0, count = 0; i < ARRAY_SIZE(amd_pstate_attr_visibility); i++) { + struct freq_attr_visibility *v = &amd_pstate_attr_visibility[i]; + + attr_visible[i] = v->visibility_fn(); + if (attr_visible[i]) + count++; + } + + /* amd_pstate_{max_freq, lowest_nonlinear_freq, highest_perf} should always be visible */ + BUG_ON(!count); + + attrs = kcalloc(count + 1, sizeof(struct freq_attr *), GFP_KERNEL); + if (!attrs) + return ERR_PTR(-ENOMEM); + + for (i = 0, j = 0; i < ARRAY_SIZE(amd_pstate_attr_visibility); i++) { + if (!attr_visible[i]) + continue; + + attrs[j++] = amd_pstate_attr_visibility[i].attr; + } + + return attrs; +} + static void amd_pstate_driver_cleanup(void) { if (amd_pstate_prefcore) sched_clear_itmt_support(); cppc_state = AMD_PSTATE_DISABLE; + kfree(current_pstate_driver->attr); + current_pstate_driver->attr = NULL; current_pstate_driver = NULL; } @@ -1250,6 +1325,7 @@ static int amd_pstate_set_driver(int mode_idx) static int amd_pstate_register_driver(int mode) { + struct freq_attr **attr = NULL; int ret; ret = amd_pstate_set_driver(mode); @@ -1258,6 +1334,22 @@ static int amd_pstate_register_driver(int mode) cppc_state = mode; + /* + * Note: It is important to compute the attrs _after_ + * re-initializing the cppc_state. Some attributes become + * visible only when cppc_state is AMD_PSTATE_ACTIVE. + */ + attr = get_freq_attrs(); + if (IS_ERR(attr)) { + ret = (int) PTR_ERR(attr); + pr_err("Couldn't compute freq_attrs for current mode %s [%d]\n", + amd_pstate_get_mode_string(cppc_state), ret); + amd_pstate_driver_cleanup(); + return ret; + } + + current_pstate_driver->attr = attr; + /* at least one CPU supports CPB */ current_pstate_driver->boost_enabled = cpu_feature_enabled(X86_FEATURE_CPB); @@ -1399,37 +1491,9 @@ static ssize_t prefcore_show(struct device *dev, return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); } -cpufreq_freq_attr_ro(amd_pstate_max_freq); -cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); - -cpufreq_freq_attr_ro(amd_pstate_highest_perf); -cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); -cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); -cpufreq_freq_attr_rw(energy_performance_preference); -cpufreq_freq_attr_ro(energy_performance_available_preferences); static DEVICE_ATTR_RW(status); static DEVICE_ATTR_RO(prefcore); -static struct freq_attr *amd_pstate_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, - &amd_pstate_prefcore_ranking, - &amd_pstate_hw_prefcore, - NULL, -}; - -static struct freq_attr *amd_pstate_epp_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, - &amd_pstate_prefcore_ranking, - &amd_pstate_hw_prefcore, - &energy_performance_preference, - &energy_performance_available_preferences, - NULL, -}; - static struct attribute *pstate_global_attributes[] = { &dev_attr_status.attr, &dev_attr_prefcore.attr, @@ -1696,7 +1760,6 @@ static struct cpufreq_driver amd_pstate_driver = { .set_boost = amd_pstate_set_boost, .update_limits = amd_pstate_update_limits, .name = "amd-pstate", - .attr = amd_pstate_attr, }; static struct cpufreq_driver amd_pstate_epp_driver = { @@ -1712,7 +1775,6 @@ static struct cpufreq_driver amd_pstate_epp_driver = { .update_limits = amd_pstate_update_limits, .set_boost = amd_pstate_set_boost, .name = "amd-pstate-epp", - .attr = amd_pstate_epp_attr, }; /* @@ -1858,7 +1920,7 @@ static int __init amd_pstate_init(void) return ret; global_attr_free: - cpufreq_unregister_driver(current_pstate_driver); + amd_pstate_unregister_driver(0); return ret; } device_initcall(amd_pstate_init); -- cgit v1.2.3 From 172100088f9b131b88bcde70724485470c20e7d2 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:48 +0530 Subject: x86/cpufeatures: Add AMD CPPC Performance Priority feature. Some future AMD processors have feature named "CPPC Performance Priority" which lets userspace specify different floor performance levels for different CPUs. The platform firmware takes these different floor performance levels into consideration while throttling the CPUs under power/thermal constraints. The presence of this feature is indicated by bit 16 of the EDX register for CPUID leaf 0x80000007. More details can be found in AMD Publication titled "AMD64 Collaborative Processor Performance Control (CPPC) Performance Priority" Revision 1.10. Define a new feature bit named X86_FEATURE_CPPC_PERF_PRIO to map to CPUID 0x80000007.EDX[16]. Reviewed-by: Borislav Petkov (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + tools/arch/x86/include/asm/cpufeatures.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index dbe104df339b..86d17b195e79 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -415,7 +415,7 @@ */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ - +#define X86_FEATURE_CPPC_PERF_PRIO (17*32+ 2) /* CPPC Floor Perf support */ #define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 42c7eac0c387..837d6a4b0c28 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -52,6 +52,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, + { X86_FEATURE_CPPC_PERF_PRIO, CPUID_EDX, 16, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 }, { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index dbe104df339b..86d17b195e79 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -415,7 +415,7 @@ */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ - +#define X86_FEATURE_CPPC_PERF_PRIO (17*32+ 2) /* CPPC Floor Perf support */ #define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ -- cgit v1.2.3 From 97838281f587a9e98e74b913201f7408214b5999 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:49 +0530 Subject: amd-pstate: Add support for CPPC_REQ2 and FLOOR_PERF Some future AMD processors have feature named "CPPC Performance Priority" which lets userspace specify different floor performance levels for different CPUs. The platform firmware takes these different floor performance levels into consideration while throttling the CPUs under power/thermal constraints. The presence of this feature is indicated by bit 16 of the EDX register for CPUID leaf 0x80000007. More details can be found in AMD Publication titled "AMD64 Collaborative Processor Performance Control (CPPC) Performance Priority" Revision 1.10. The number of distinct floor performance levels supported on the platform will be advertised through the bits 32:39 of the MSR_AMD_CPPC_CAP1. Bits 0:7 of a new MSR MSR_AMD_CPPC_REQ2 (0xc00102b5) will be used to specify the desired floor performance level for that CPU. Add support for the aforementioned MSR_AMD_CPPC_REQ2, and macros for parsing and updating the relevant bits from MSR_AMD_CPPC_CAP1 and MSR_AMD_CPPC_REQ2. On boot if the default value of the MSR_AMD_CPPC_REQ2[7:0] (Floor Perf) is lower than CPPC.lowest_perf, and thus invalid, initialize it to MSR_AMD_CPPC_CAP1.nominal_perf which is a sane default value. Save the boot-time floor_perf during amd_pstate_init_floor_perf(). In a subsequent patch it will be restored in the suspend, offline, and exit paths, mirroring how bios_min_perf is handled for MSR_AMD_CPPC_REQ. Link: https://docs.amd.com/v/u/en-US/69206_1.10_AMD64_CPPC_PUB Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- arch/x86/include/asm/msr-index.h | 5 +++ drivers/cpufreq/amd-pstate.c | 78 +++++++++++++++++++++++++++++++++++++++- drivers/cpufreq/amd-pstate.h | 8 +++++ 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6673601246b3..e126c7fb69cf 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -765,12 +765,14 @@ #define MSR_AMD_CPPC_CAP2 0xc00102b2 #define MSR_AMD_CPPC_REQ 0xc00102b3 #define MSR_AMD_CPPC_STATUS 0xc00102b4 +#define MSR_AMD_CPPC_REQ2 0xc00102b5 /* Masks for use with MSR_AMD_CPPC_CAP1 */ #define AMD_CPPC_LOWEST_PERF_MASK GENMASK(7, 0) #define AMD_CPPC_LOWNONLIN_PERF_MASK GENMASK(15, 8) #define AMD_CPPC_NOMINAL_PERF_MASK GENMASK(23, 16) #define AMD_CPPC_HIGHEST_PERF_MASK GENMASK(31, 24) +#define AMD_CPPC_FLOOR_PERF_CNT_MASK GENMASK_ULL(39, 32) /* Masks for use with MSR_AMD_CPPC_REQ */ #define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0) @@ -778,6 +780,9 @@ #define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16) #define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24) +/* Masks for use with MSR_AMD_CPPC_REQ2 */ +#define AMD_CPPC_FLOOR_PERF_MASK GENMASK(7, 0) + /* AMD Performance Counter Global Status and Control MSRs */ #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 4de2037a414c..53b8173ff183 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -329,6 +329,65 @@ static inline int amd_pstate_set_epp(struct cpufreq_policy *policy, u8 epp) return static_call(amd_pstate_set_epp)(policy, epp); } +static int amd_pstate_set_floor_perf(struct cpufreq_policy *policy, u8 perf) +{ + struct amd_cpudata *cpudata = policy->driver_data; + u64 value, prev; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO)) + return 0; + + value = prev = READ_ONCE(cpudata->cppc_req2_cached); + FIELD_MODIFY(AMD_CPPC_FLOOR_PERF_MASK, &value, perf); + + if (value == prev) + return 0; + + ret = wrmsrq_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ2, value); + if (ret) { + pr_err("failed to set CPPC REQ2 value. Error (%d)\n", ret); + return ret; + } + + WRITE_ONCE(cpudata->cppc_req2_cached, value); + + return ret; +} + +static int amd_pstate_init_floor_perf(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + u8 floor_perf; + u64 value; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO)) + return 0; + + ret = rdmsrq_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ2, &value); + if (ret) { + pr_err("failed to read CPPC REQ2 value. Error (%d)\n", ret); + return ret; + } + + WRITE_ONCE(cpudata->cppc_req2_cached, value); + floor_perf = FIELD_GET(AMD_CPPC_FLOOR_PERF_MASK, + cpudata->cppc_req2_cached); + + /* Set a sane value for floor_perf if the default value is invalid */ + if (floor_perf < cpudata->perf.lowest_perf) { + floor_perf = cpudata->perf.nominal_perf; + ret = amd_pstate_set_floor_perf(policy, floor_perf); + if (ret) + return ret; + } + + cpudata->bios_floor_perf = floor_perf; + + return 0; +} + static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) { struct amd_cpudata *cpudata = policy->driver_data; @@ -426,6 +485,7 @@ static int msr_init_perf(struct amd_cpudata *cpudata) perf.lowest_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1); WRITE_ONCE(cpudata->perf, perf); WRITE_ONCE(cpudata->prefcore_ranking, FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, cap1)); + WRITE_ONCE(cpudata->floor_perf_cnt, FIELD_GET(AMD_CPPC_FLOOR_PERF_CNT_MASK, cap1)); return 0; } @@ -1024,6 +1084,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) cpudata->nominal_freq, perf.highest_perf); + policy->driver_data = cpudata; ret = amd_pstate_cppc_enable(policy); if (ret) goto free_cpudata1; @@ -1036,6 +1097,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) if (cpu_feature_enabled(X86_FEATURE_CPPC)) policy->fast_switch_possible = true; + ret = amd_pstate_init_floor_perf(policy); + if (ret) { + dev_err(dev, "Failed to initialize Floor Perf (%d)\n", ret); + goto free_cpudata1; + } + ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); if (ret < 0) { @@ -1050,7 +1117,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) goto free_cpudata2; } - policy->driver_data = cpudata; if (!current_pstate_driver->adjust_perf) current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; @@ -1062,6 +1128,7 @@ free_cpudata2: free_cpudata1: pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret); kfree(cpudata); + policy->driver_data = NULL; return ret; } @@ -1072,6 +1139,7 @@ static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) /* Reset CPPC_REQ MSR to the BIOS value */ amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); freq_qos_remove_request(&cpudata->req[1]); freq_qos_remove_request(&cpudata->req[0]); @@ -1598,6 +1666,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; + ret = amd_pstate_init_floor_perf(policy); + if (ret) { + dev_err(dev, "Failed to initialize Floor Perf (%d)\n", ret); + goto free_cpudata1; + } + current_pstate_driver->adjust_perf = NULL; return 0; @@ -1605,6 +1679,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) free_cpudata1: pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret); kfree(cpudata); + policy->driver_data = NULL; return ret; } @@ -1617,6 +1692,7 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) /* Reset CPPC_REQ MSR to the BIOS value */ amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); kfree(cpudata); policy->driver_data = NULL; diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index cb45fdca27a6..303da70b0afa 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -62,9 +62,14 @@ struct amd_aperf_mperf { * @cpu: CPU number * @req: constraint request to apply * @cppc_req_cached: cached performance request hints + * @cppc_req2_cached: cached value of MSR_AMD_CPPC_REQ2 * @perf: cached performance-related data * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher * priority. + * @floor_perf_cnt: Cached value of the number of distinct floor + * performance levels supported + * @bios_floor_perf: Cached value of the boot-time floor performance level from + * MSR_AMD_CPPC_REQ2 * @min_limit_freq: Cached value of policy->min (in khz) * @max_limit_freq: Cached value of policy->max (in khz) * @nominal_freq: the frequency (in khz) that mapped to nominal_perf @@ -87,10 +92,13 @@ struct amd_cpudata { struct freq_qos_request req[2]; u64 cppc_req_cached; + u64 cppc_req2_cached; union perf_cached perf; u8 prefcore_ranking; + u8 floor_perf_cnt; + u8 bios_floor_perf; u32 min_limit_freq; u32 max_limit_freq; u32 nominal_freq; -- cgit v1.2.3 From b9f103d0968bc5b33bff1b1eb11c756b2ac07c6c Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:50 +0530 Subject: amd-pstate: Add sysfs support for floor_freq and floor_count When Floor Performance feature is supported by the platform, expose two sysfs files: * amd_pstate_floor_freq to allow userspace to request the floor frequency for each CPU. * amd_pstate_floor_count which advertises the number of distinct levels of floor frequencies supported on this platform. Reset the floor_perf to bios_floor_perf in the suspend, offline, and exit paths, and restore the value to the cached user-request floor_freq on the resume and online paths mirroring how bios_min_perf is handled for MSR_AMD_CPPC_REQ. Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 93 +++++++++++++++++++++++++++++++++++++++++--- drivers/cpufreq/amd-pstate.h | 2 + 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 53b8173ff183..a068c4457a8f 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -383,8 +383,10 @@ static int amd_pstate_init_floor_perf(struct cpufreq_policy *policy) return ret; } - cpudata->bios_floor_perf = floor_perf; + cpudata->bios_floor_perf = floor_perf; + cpudata->floor_freq = perf_to_freq(cpudata->perf, cpudata->nominal_freq, + floor_perf); return 0; } @@ -1288,6 +1290,46 @@ static ssize_t show_energy_performance_preference( return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); } +static ssize_t store_amd_pstate_floor_freq(struct cpufreq_policy *policy, + const char *buf, size_t count) +{ + struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); + unsigned int freq; + u8 floor_perf; + int ret; + + ret = kstrtouint(buf, 0, &freq); + if (ret) + return ret; + + if (freq < policy->cpuinfo.min_freq || freq > policy->max) + return -EINVAL; + + floor_perf = freq_to_perf(perf, cpudata->nominal_freq, freq); + ret = amd_pstate_set_floor_perf(policy, floor_perf); + + if (!ret) + cpudata->floor_freq = freq; + + return ret ?: count; +} + +static ssize_t show_amd_pstate_floor_freq(struct cpufreq_policy *policy, char *buf) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + return sysfs_emit(buf, "%u\n", cpudata->floor_freq); +} + +static ssize_t show_amd_pstate_floor_count(struct cpufreq_policy *policy, char *buf) +{ + struct amd_cpudata *cpudata = policy->driver_data; + u8 count = cpudata->floor_perf_cnt; + + return sysfs_emit(buf, "%u\n", count); +} + cpufreq_freq_attr_ro(amd_pstate_max_freq); cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); @@ -1296,6 +1338,8 @@ cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); cpufreq_freq_attr_rw(energy_performance_preference); cpufreq_freq_attr_ro(energy_performance_available_preferences); +cpufreq_freq_attr_rw(amd_pstate_floor_freq); +cpufreq_freq_attr_ro(amd_pstate_floor_count); struct freq_attr_visibility { struct freq_attr *attr; @@ -1320,6 +1364,12 @@ static bool epp_visibility(void) return cppc_state == AMD_PSTATE_ACTIVE; } +/* Determines whether amd_pstate_floor_freq related attributes should be visible */ +static bool floor_freq_visibility(void) +{ + return cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO); +} + static struct freq_attr_visibility amd_pstate_attr_visibility[] = { {&amd_pstate_max_freq, always_visible}, {&amd_pstate_lowest_nonlinear_freq, always_visible}, @@ -1328,6 +1378,8 @@ static struct freq_attr_visibility amd_pstate_attr_visibility[] = { {&amd_pstate_hw_prefcore, prefcore_visibility}, {&energy_performance_preference, epp_visibility}, {&energy_performance_available_preferences, epp_visibility}, + {&amd_pstate_floor_freq, floor_freq_visibility}, + {&amd_pstate_floor_count, floor_freq_visibility}, }; static struct freq_attr **get_freq_attrs(void) @@ -1748,24 +1800,39 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) static int amd_pstate_cpu_online(struct cpufreq_policy *policy) { - return amd_pstate_cppc_enable(policy); + struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); + u8 cached_floor_perf; + int ret; + + ret = amd_pstate_cppc_enable(policy); + if (ret) + return ret; + + cached_floor_perf = freq_to_perf(perf, cpudata->nominal_freq, cpudata->floor_freq); + return amd_pstate_set_floor_perf(policy, cached_floor_perf); } static int amd_pstate_cpu_offline(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; union perf_cached perf = READ_ONCE(cpudata->perf); + int ret; /* * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified * min_perf value across kexec reboots. If this CPU is just onlined normally after this, the * limits, epp and desired perf will get reset to the cached values in cpudata struct */ - return amd_pstate_update_perf(policy, perf.bios_min_perf, + ret = amd_pstate_update_perf(policy, perf.bios_min_perf, FIELD_GET(AMD_CPPC_DES_PERF_MASK, cpudata->cppc_req_cached), FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached), false); + if (ret) + return ret; + + return amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); } static int amd_pstate_suspend(struct cpufreq_policy *policy) @@ -1787,6 +1854,10 @@ static int amd_pstate_suspend(struct cpufreq_policy *policy) if (ret) return ret; + ret = amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); + if (ret) + return ret; + /* set this flag to avoid setting core offline*/ cpudata->suspended = true; @@ -1798,15 +1869,24 @@ static int amd_pstate_resume(struct cpufreq_policy *policy) struct amd_cpudata *cpudata = policy->driver_data; union perf_cached perf = READ_ONCE(cpudata->perf); int cur_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->cur); + u8 cached_floor_perf; + int ret; /* Set CPPC_REQ to last sane value until the governor updates it */ - return amd_pstate_update_perf(policy, perf.min_limit_perf, cur_perf, perf.max_limit_perf, - 0U, false); + ret = amd_pstate_update_perf(policy, perf.min_limit_perf, cur_perf, perf.max_limit_perf, + 0U, false); + if (ret) + return ret; + + cached_floor_perf = freq_to_perf(perf, cpudata->nominal_freq, cpudata->floor_freq); + return amd_pstate_set_floor_perf(policy, cached_floor_perf); } static int amd_pstate_epp_resume(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); + u8 cached_floor_perf; if (cpudata->suspended) { int ret; @@ -1819,7 +1899,8 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) cpudata->suspended = false; } - return 0; + cached_floor_perf = freq_to_perf(perf, cpudata->nominal_freq, cpudata->floor_freq); + return amd_pstate_set_floor_perf(policy, cached_floor_perf); } static struct cpufreq_driver amd_pstate_driver = { diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 303da70b0afa..453adfb445f8 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -74,6 +74,7 @@ struct amd_aperf_mperf { * @max_limit_freq: Cached value of policy->max (in khz) * @nominal_freq: the frequency (in khz) that mapped to nominal_perf * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf + * @floor_freq: Cached value of the user requested floor_freq * @cur: Difference of Aperf/Mperf/tsc count between last and current sample * @prev: Last Aperf/Mperf/tsc count value read from register * @freq: current cpu frequency value (in khz) @@ -103,6 +104,7 @@ struct amd_cpudata { u32 max_limit_freq; u32 nominal_freq; u32 lowest_nonlinear_freq; + u32 floor_freq; struct amd_aperf_mperf cur; struct amd_aperf_mperf prev; -- cgit v1.2.3 From 30c63f723440f12626a19da5a93e094da29af51e Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:51 +0530 Subject: amd-pstate: Introduce a tracepoint trace_amd_pstate_cppc_req2() Introduce a new tracepoint trace_amd_pstate_cppc_req2() to track updates to MSR_AMD_CPPC_REQ2. Invoke this while changing the Floor Perf. Reviewed-by: Mario Limonciello Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate-trace.h | 35 +++++++++++++++++++++++++++++++++++ drivers/cpufreq/amd-pstate.c | 14 +++++++++++--- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h index 32e1bdc588c5..91fa073b2be4 100644 --- a/drivers/cpufreq/amd-pstate-trace.h +++ b/drivers/cpufreq/amd-pstate-trace.h @@ -133,6 +133,41 @@ TRACE_EVENT(amd_pstate_epp_perf, ) ); +TRACE_EVENT(amd_pstate_cppc_req2, + + TP_PROTO(unsigned int cpu_id, + u8 floor_perf, + bool changed, + int err_code + ), + + TP_ARGS(cpu_id, + floor_perf, + changed, + err_code), + + TP_STRUCT__entry( + __field(unsigned int, cpu_id) + __field(u8, floor_perf) + __field(bool, changed) + __field(int, err_code) + ), + + TP_fast_assign( + __entry->cpu_id = cpu_id; + __entry->floor_perf = floor_perf; + __entry->changed = changed; + __entry->err_code = err_code; + ), + + TP_printk("cpu%u: floor_perf=%u, changed=%u (error = %d)", + __entry->cpu_id, + __entry->floor_perf, + __entry->changed, + __entry->err_code + ) +); + #endif /* _AMD_PSTATE_TRACE_H */ /* This part must be outside protection */ diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index a068c4457a8f..5eae74a67aeb 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -333,6 +333,7 @@ static int amd_pstate_set_floor_perf(struct cpufreq_policy *policy, u8 perf) { struct amd_cpudata *cpudata = policy->driver_data; u64 value, prev; + bool changed; int ret; if (!cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO)) @@ -341,17 +342,24 @@ static int amd_pstate_set_floor_perf(struct cpufreq_policy *policy, u8 perf) value = prev = READ_ONCE(cpudata->cppc_req2_cached); FIELD_MODIFY(AMD_CPPC_FLOOR_PERF_MASK, &value, perf); - if (value == prev) - return 0; + changed = value != prev; + if (!changed) { + ret = 0; + goto out_trace; + } ret = wrmsrq_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ2, value); if (ret) { + changed = false; pr_err("failed to set CPPC REQ2 value. Error (%d)\n", ret); - return ret; + goto out_trace; } WRITE_ONCE(cpudata->cppc_req2_cached, value); +out_trace: + if (trace_amd_pstate_cppc_req2_enabled()) + trace_amd_pstate_cppc_req2(cpudata->cpu, perf, changed, ret); return ret; } -- cgit v1.2.3 From c6a2b750de13db9103db29c64927bec3919232b5 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:52 +0530 Subject: amd-pstate-ut: Add module parameter to select testcases Currently when amd-pstate-ut test module is loaded, it runs all the tests from amd_pstate_ut_cases[] array. Add a module parameter named "test_list" that accepts a comma-delimited list of test names, allowing users to run a selected subset of tests. When the parameter is omitted or empty, all tests are run as before. Signed-off-by: Gautham R. Shenoy Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate-ut.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 447b9aa5ce40..3dcdf56883a6 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -35,6 +35,10 @@ #include "amd-pstate.h" +static char *test_list; +module_param(test_list, charp, 0444); +MODULE_PARM_DESC(test_list, + "Comma-delimited list of tests to run (empty means run all tests)"); struct amd_pstate_ut_struct { const char *name; @@ -58,6 +62,25 @@ static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver } }; +static bool test_in_list(const char *list, const char *name) +{ + size_t name_len = strlen(name); + const char *p = list; + + while (*p) { + const char *sep = strchr(p, ','); + size_t token_len = sep ? sep - p : strlen(p); + + if (token_len == name_len && !strncmp(p, name, token_len)) + return true; + if (!sep) + break; + p = sep + 1; + } + + return false; +} + static bool get_shared_mem(void) { bool result = false; @@ -275,7 +298,13 @@ static int __init amd_pstate_ut_init(void) u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases); for (i = 0; i < arr_size; i++) { - int ret = amd_pstate_ut_cases[i].func(i); + int ret; + + if (test_list && *test_list && + !test_in_list(test_list, amd_pstate_ut_cases[i].name)) + continue; + + ret = amd_pstate_ut_cases[i].func(i); if (ret) pr_err("%-4d %-20s\t fail: %d!\n", i+1, amd_pstate_ut_cases[i].name, ret); -- cgit v1.2.3 From 3b90e5a4176acacc6781b9ac84cdc5ac53671eee Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:53 +0530 Subject: amd-pstate-ut: Add a testcase to validate the visibility of driver attributes amd-pstate driver has per-attribute visibility functions to dynamically control which sysfs freq_attrs are exposed based on the platform capabilities and the current amd_pstate mode. However, there is no test coverage to validate that the driver's live attribute list matches the expected visibility for each mode. Add amd_pstate_ut_check_freq_attrs() to the amd-pstate unit test module. For each enabled mode (passive, active, guided), the test independently derives the expected visibility of each attribute: - Core attributes (max_freq, lowest_nonlinear_freq, highest_perf) are always expected. - Prefcore attributes (prefcore_ranking, hw_prefcore) are expected only when cpudata->hw_prefcore indicates platform support. - EPP attributes (energy_performance_preference, energy_performance_available_preferences) are expected only in active mode. - Floor frequency attributes (floor_freq, floor_count) are expected only when X86_FEATURE_CPPC_PERF_PRIO is present. Compare these independent expectations against the live driver's attr array, catching bugs such as attributes leaking into wrong modes or visibility functions checking incorrect conditions. Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate-ut.c | 139 ++++++++++++++++++++++++++++++++++++++-- drivers/cpufreq/amd-pstate.c | 8 +++ drivers/cpufreq/amd-pstate.h | 4 ++ 3 files changed, 146 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 3dcdf56883a6..1f62ab6438b4 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -23,6 +23,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include +#include #include #include #include @@ -53,13 +55,15 @@ static int amd_pstate_ut_check_enabled(u32 index); static int amd_pstate_ut_check_perf(u32 index); static int amd_pstate_ut_check_freq(u32 index); static int amd_pstate_ut_check_driver(u32 index); +static int amd_pstate_ut_check_freq_attrs(u32 index); static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { - {"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid }, - {"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled }, - {"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf }, - {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }, - {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver } + {"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid }, + {"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled }, + {"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf }, + {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }, + {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver }, + {"amd_pstate_ut_check_freq_attrs", amd_pstate_ut_check_freq_attrs }, }; static bool test_in_list(const char *list, const char *name) @@ -293,6 +297,131 @@ out: return ret; } +enum attr_category { + ATTR_ALWAYS, + ATTR_PREFCORE, + ATTR_EPP, + ATTR_FLOOR_FREQ, +}; + +static const struct { + const char *name; + enum attr_category category; +} expected_freq_attrs[] = { + {"amd_pstate_max_freq", ATTR_ALWAYS}, + {"amd_pstate_lowest_nonlinear_freq", ATTR_ALWAYS}, + {"amd_pstate_highest_perf", ATTR_ALWAYS}, + {"amd_pstate_prefcore_ranking", ATTR_PREFCORE}, + {"amd_pstate_hw_prefcore", ATTR_PREFCORE}, + {"energy_performance_preference", ATTR_EPP}, + {"energy_performance_available_preferences", ATTR_EPP}, + {"amd_pstate_floor_freq", ATTR_FLOOR_FREQ}, + {"amd_pstate_floor_count", ATTR_FLOOR_FREQ}, +}; + +static bool attr_in_driver(struct freq_attr **driver_attrs, const char *name) +{ + int j; + + for (j = 0; driver_attrs[j]; j++) { + if (!strcmp(driver_attrs[j]->attr.name, name)) + return true; + } + return false; +} + +/* + * Verify that for each mode the driver's live ->attr array contains exactly + * the attributes that should be visible. Expected visibility is derived + * independently from hw_prefcore, cpu features, and the current mode — + * not from the driver's own visibility functions. + */ +static int amd_pstate_ut_check_freq_attrs(u32 index) +{ + enum amd_pstate_mode orig_mode = amd_pstate_get_status(); + static const enum amd_pstate_mode modes[] = { + AMD_PSTATE_PASSIVE, AMD_PSTATE_ACTIVE, AMD_PSTATE_GUIDED, + }; + bool has_prefcore, has_floor_freq; + int m, i, ret; + + has_floor_freq = cpu_feature_enabled(X86_FEATURE_CPPC_PERF_PRIO); + + /* + * Determine prefcore support from any online CPU's cpudata. + * hw_prefcore reflects the platform-wide decision made at init. + */ + has_prefcore = false; + for_each_online_cpu(i) { + struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; + struct amd_cpudata *cpudata; + + policy = cpufreq_cpu_get(i); + if (!policy) + continue; + cpudata = policy->driver_data; + has_prefcore = cpudata->hw_prefcore; + break; + } + + for (m = 0; m < ARRAY_SIZE(modes); m++) { + struct freq_attr **driver_attrs; + + ret = amd_pstate_set_mode(modes[m]); + if (ret) + goto out; + + driver_attrs = amd_pstate_get_current_attrs(); + if (!driver_attrs) { + pr_err("%s: no driver attrs in mode %s\n", + __func__, amd_pstate_get_mode_string(modes[m])); + ret = -EINVAL; + goto out; + } + + for (i = 0; i < ARRAY_SIZE(expected_freq_attrs); i++) { + bool expected, found; + + switch (expected_freq_attrs[i].category) { + case ATTR_ALWAYS: + expected = true; + break; + case ATTR_PREFCORE: + expected = has_prefcore; + break; + case ATTR_EPP: + expected = (modes[m] == AMD_PSTATE_ACTIVE); + break; + case ATTR_FLOOR_FREQ: + expected = has_floor_freq; + break; + default: + expected = false; + break; + } + + found = attr_in_driver(driver_attrs, + expected_freq_attrs[i].name); + + if (expected != found) { + pr_err("%s: mode %s: attr %s expected %s but is %s\n", + __func__, + amd_pstate_get_mode_string(modes[m]), + expected_freq_attrs[i].name, + expected ? "visible" : "hidden", + found ? "visible" : "hidden"); + ret = -EINVAL; + goto out; + } + } + } + + ret = 0; +out: + amd_pstate_set_mode(orig_mode); + return ret; +} + static int __init amd_pstate_ut_init(void) { u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases); diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5eae74a67aeb..ed9fd4155a25 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1390,6 +1390,14 @@ static struct freq_attr_visibility amd_pstate_attr_visibility[] = { {&amd_pstate_floor_count, floor_freq_visibility}, }; +struct freq_attr **amd_pstate_get_current_attrs(void) +{ + if (!current_pstate_driver) + return NULL; + return current_pstate_driver->attr; +} +EXPORT_SYMBOL_GPL(amd_pstate_get_current_attrs); + static struct freq_attr **get_freq_attrs(void) { bool attr_visible[ARRAY_SIZE(amd_pstate_attr_visibility)]; diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 453adfb445f8..faead0b19a8a 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -134,4 +134,8 @@ const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode); int amd_pstate_get_status(void); int amd_pstate_update_status(const char *buf, size_t size); +struct freq_attr; + +struct freq_attr **amd_pstate_get_current_attrs(void); + #endif /* _LINUX_AMD_PSTATE_H */ -- cgit v1.2.3 From 7e1cf24efba4b59a275e87372267dadcb7fd1850 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:54 +0530 Subject: Documentation/amd-pstate: List amd_pstate_hw_prefcore sysfs file Add the missing amd_pstate_hw_prefcore filenames in the sysfs listing example leading to the descriptions of these parameters. Clarify when will the file be visible. Fixes: b96b82d1af7f ("cpufreq: amd-pstate: Add documentation for `amd_pstate_hw_prefcore`") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index e1771f2225d5..b8c846cbf301 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -239,6 +239,7 @@ control its functionality at the system level. They are located in the root@hr-test1:/home/ray# ls /sys/devices/system/cpu/cpufreq/policy0/*amd* /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_highest_perf + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_hw_prefcore /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_lowest_nonlinear_freq /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_max_freq @@ -264,8 +265,9 @@ This attribute is read-only. ``amd_pstate_hw_prefcore`` -Whether the platform supports the preferred core feature and it has been -enabled. This attribute is read-only. +Whether the platform supports the preferred core feature and it has +been enabled. This attribute is read-only. This file is only visible +on platforms which support the preferred core feature. ``amd_pstate_prefcore_ranking`` -- cgit v1.2.3 From a5bc4c44aeec2920931e17db7f93965fcd69ee2f Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:55 +0530 Subject: Documentation/amd-pstate: List amd_pstate_prefcore_ranking sysfs file Add the missing amd_pstate_prefcore_ranking filenames in the sysfs listing example leading to the descriptions of these parameters. Clarify when will the file be visible. Fixes: 15a2b764ea7c ("amd-pstate: Add missing documentation for `amd_pstate_prefcore_ranking`") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index b8c846cbf301..b31a478c28ba 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -242,6 +242,7 @@ control its functionality at the system level. They are located in the /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_hw_prefcore /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_lowest_nonlinear_freq /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_max_freq + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_prefcore_ranking ``amd_pstate_highest_perf / amd_pstate_max_freq`` @@ -273,7 +274,8 @@ on platforms which support the preferred core feature. The performance ranking of the core. This number doesn't have any unit, but larger numbers are preferred at the time of reading. This can change at -runtime based on platform conditions. This attribute is read-only. +runtime based on platform conditions. This attribute is read-only. This file +is only visible on platforms which support the preferred core feature. ``energy_performance_available_preferences`` -- cgit v1.2.3 From 88d2ca6a68fcc43d65b3b056cb8c481804b100b0 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 26 Mar 2026 17:17:56 +0530 Subject: Documentation/amd-pstate: Add documentation for amd_pstate_floor_{freq,count} Add documentation for the sysfs files /sys/devices/system/cpu/cpufreq/policy*/amd_pstate_floor_freq and /sys/devices/system/cpu/cpufreq/policy*/amd_pstate_floor_count. Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index b31a478c28ba..d6c2f233ab23 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -242,6 +242,8 @@ control its functionality at the system level. They are located in the /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_hw_prefcore /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_lowest_nonlinear_freq /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_max_freq + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_floor_freq + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_floor_count /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_prefcore_ranking @@ -277,6 +279,36 @@ larger numbers are preferred at the time of reading. This can change at runtime based on platform conditions. This attribute is read-only. This file is only visible on platforms which support the preferred core feature. +``amd_pstate_floor_freq`` + +The floor frequency associated with each CPU. Userspace can write any +value between ``cpuinfo_min_freq`` and ``scaling_max_freq`` into this +file. When the system is under power or thermal constraints, the +platform firmware will attempt to throttle the CPU frequency to the +value specified in ``amd_pstate_floor_freq`` before throttling it +further. This allows userspace to specify different floor frequencies +to different CPUs. For optimal results, threads of the same core +should have the same floor frequency value. This file is only visible +on platforms that support the CPPC Performance Priority feature. + + +``amd_pstate_floor_count`` + +The number of distinct Floor Performance levels supported by the +platform. For example, if this value is 2, then the number of unique +values obtained from the command ``cat +/sys/devices/system/cpu/cpufreq/policy*/amd_pstate_floor_freq | +sort -n | uniq`` should be at most this number for the behavior +described in ``amd_pstate_floor_freq`` to take effect. A zero value +implies that the platform supports unlimited floor performance levels. +This file is only visible on platforms that support the CPPC +Performance Priority feature. + +**Note**: When ``amd_pstate_floor_count`` is non-zero, the frequency to +which the CPU is throttled under power or thermal constraints is +undefined when the number of unique values of ``amd_pstate_floor_freq`` +across all CPUs in the system exceeds ``amd_pstate_floor_count``. + ``energy_performance_available_preferences`` A list of all the supported EPP preferences that could be used for -- cgit v1.2.3 From 8cdc494013dfcd48f31eafe19b18fd67c224dd8a Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 26 Mar 2026 14:36:20 -0500 Subject: cpufreq/amd-pstate: Cache the max frequency in cpudata The value of maximum frequency is fixed and never changes. Doing calculations every time based off of perf is unnecessary. Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20260326193620.649441-1-mario.limonciello@amd.com Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 27 +++++++++------------------ drivers/cpufreq/amd-pstate.h | 2 ++ 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ed9fd4155a25..f207252eb5f5 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -826,15 +826,13 @@ static void amd_pstate_adjust_perf(unsigned int cpu, static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) { struct amd_cpudata *cpudata = policy->driver_data; - union perf_cached perf = READ_ONCE(cpudata->perf); - u32 nominal_freq, max_freq; + u32 nominal_freq; int ret = 0; nominal_freq = READ_ONCE(cpudata->nominal_freq); - max_freq = perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf); if (on) - policy->cpuinfo.max_freq = max_freq; + policy->cpuinfo.max_freq = cpudata->max_freq; else if (policy->cpuinfo.max_freq > nominal_freq) policy->cpuinfo.max_freq = nominal_freq; @@ -1021,13 +1019,15 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) WRITE_ONCE(cpudata->nominal_freq, nominal_freq); + /* max_freq is calculated according to (nominal_freq * highest_perf)/nominal_perf */ max_freq = perf_to_freq(perf, nominal_freq, perf.highest_perf); + WRITE_ONCE(cpudata->max_freq, max_freq); + lowest_nonlinear_freq = perf_to_freq(perf, nominal_freq, perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); /** * Below values need to be initialized correctly, otherwise driver will fail to load - * max_freq is calculated according to (nominal_freq * highest_perf)/nominal_perf * lowest_nonlinear_freq is a value between [min_freq, nominal_freq] * Check _CPC in ACPI table objects if any values are incorrect */ @@ -1090,9 +1090,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.min_freq = policy->min = perf_to_freq(perf, cpudata->nominal_freq, perf.lowest_perf); - policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf, - cpudata->nominal_freq, - perf.highest_perf); + policy->cpuinfo.max_freq = policy->max = cpudata->max_freq; policy->driver_data = cpudata; ret = amd_pstate_cppc_enable(policy); @@ -1167,14 +1165,9 @@ static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, char *buf) { - struct amd_cpudata *cpudata; - union perf_cached perf; - - cpudata = policy->driver_data; - perf = READ_ONCE(cpudata->perf); + struct amd_cpudata *cpudata = policy->driver_data; - return sysfs_emit(buf, "%u\n", - perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf)); + return sysfs_emit(buf, "%u\n", cpudata->max_freq); } static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, @@ -1702,9 +1695,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.min_freq = policy->min = perf_to_freq(perf, cpudata->nominal_freq, perf.lowest_perf); - policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf, - cpudata->nominal_freq, - perf.highest_perf); + policy->cpuinfo.max_freq = policy->max = cpudata->max_freq; policy->driver_data = cpudata; ret = amd_pstate_cppc_enable(policy); diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index faead0b19a8a..32b8b26ce388 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -73,6 +73,7 @@ struct amd_aperf_mperf { * @min_limit_freq: Cached value of policy->min (in khz) * @max_limit_freq: Cached value of policy->max (in khz) * @nominal_freq: the frequency (in khz) that mapped to nominal_perf + * @max_freq: in ideal conditions the maximum frequency (in khz) possible frequency * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf * @floor_freq: Cached value of the user requested floor_freq * @cur: Difference of Aperf/Mperf/tsc count between last and current sample @@ -103,6 +104,7 @@ struct amd_cpudata { u32 min_limit_freq; u32 max_limit_freq; u32 nominal_freq; + u32 max_freq; u32 lowest_nonlinear_freq; u32 floor_freq; -- cgit v1.2.3 From a362ae6e7e85bca4c870c37085d7793c4beec360 Mon Sep 17 00:00:00 2001 From: Ninad Naik Date: Tue, 31 Mar 2026 00:38:55 +0530 Subject: Documentation: amd-pstate: fix dead links in the reference section The links for AMD64 Architecture Programmer's Manual and PPR for AMD Family 19h Model 51h, Revision A1 Processors redirect to a generic page. Update the links to the working ones. Signed-off-by: Ninad Naik Link: https://lore.kernel.org/r/20260330190855.1115304-1-ninadnaik07@gmail.com Acked-by: Mario Limonciello (AMD) Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index d6c2f233ab23..b43675b7f739 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -826,13 +826,13 @@ Reference =========== .. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming, - https://www.amd.com/system/files/TechDocs/24593.pdf + https://docs.amd.com/v/u/en-US/24593_3.44_APM_Vol2 .. [2] Advanced Configuration and Power Interface Specification, https://uefi.org/sites/default/files/resources/ACPI_Spec_6_4_Jan22.pdf .. [3] Processor Programming Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors - https://www.amd.com/system/files/TechDocs/56569-A1-PUB.zip + https://docs.amd.com/v/u/en-US/56569-A1-PUB_3.03 .. [4] Linux Kernel Selftests, https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html -- cgit v1.2.3 From e30ca6dd5345c5b8ba05f346a8e81105352fe571 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:07 -0500 Subject: cpufreq/amd-pstate: Add dynamic energy performance preference Dynamic energy performance preference changes the EPP profile based on whether the machine is running on AC or DC power. A notification chain from the power supply core is used to adjust EPP values on plug in or plug out events. When enabled, the driver exposes a sysfs toggle for dynamic EPP, blocks manual writes to energy_performance_preference while it "owns" the EPP updates. For non-server systems: * the default EPP for AC mode is `performance`. * the default EPP for DC mode is `balance_performance`. For server systems dynamic EPP is mostly a no-op. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 18 +++- drivers/cpufreq/Kconfig.x86 | 12 +++ drivers/cpufreq/amd-pstate.c | 128 ++++++++++++++++++++++++++-- drivers/cpufreq/amd-pstate.h | 10 ++- 4 files changed, 160 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index b43675b7f739..bb1341763882 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -325,7 +325,7 @@ and user can change current preference according to energy or performance needs Please get all support profiles list from ``energy_performance_available_preferences`` attribute, all the profiles are integer values defined between 0 to 255 when EPP feature is enabled by platform -firmware, if EPP feature is disabled, driver will ignore the written value +firmware, but if the dynamic EPP feature is enabled, driver will block writes. This attribute is read-write. ``boost`` @@ -347,6 +347,22 @@ boost or `1` to enable it, for the respective CPU using the sysfs path Other performance and frequency values can be read back from ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. +Dynamic energy performance profile +================================== +The amd-pstate driver supports dynamically selecting the energy performance +profile based on whether the machine is running on AC or DC power. + +Whether this behavior is enabled by default depends on the kernel +config option `CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP`. This behavior can also be overridden +at runtime by the sysfs file ``/sys/devices/system/cpu/cpufreq/policyX/dynamic_epp``. + +When set to enabled, the driver will select a different energy performance +profile when the machine is running on battery or AC power. +When set to disabled, the driver will not change the energy performance profile +based on the power source and will not react to user desired power state. + +Attempting to manually write to the ``energy_performance_preference`` sysfs +file will fail when ``dynamic_epp`` is enabled. ``amd-pstate`` vs ``acpi-cpufreq`` ====================================== diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index 2c5c228408bf..cdaa8d858045 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -68,6 +68,18 @@ config X86_AMD_PSTATE_DEFAULT_MODE For details, take a look at: . +config X86_AMD_PSTATE_DYNAMIC_EPP + bool "AMD Processor P-State dynamic EPP support" + depends on X86_AMD_PSTATE + default n + help + Allow the kernel to dynamically change the energy performance + value from events like ACPI platform profile and AC adapter plug + events. + + This feature can also be changed at runtime, this configuration + option only sets the kernel default value behavior. + config X86_AMD_PSTATE_UT tristate "selftest for AMD Processor P-State driver" depends on X86 && ACPI_PROCESSOR diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index f207252eb5f5..379e7dd44252 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,11 @@ static struct cpufreq_driver amd_pstate_driver; static struct cpufreq_driver amd_pstate_epp_driver; static int cppc_state = AMD_PSTATE_UNDEFINED; static bool amd_pstate_prefcore = true; +#ifdef CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP +static bool dynamic_epp = CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP; +#else +static bool dynamic_epp; +#endif static struct quirk_entry *quirks; /* @@ -1155,6 +1161,73 @@ static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) kfree(cpudata); } +static int amd_pstate_get_balanced_epp(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (power_supply_is_system_supplied()) + return cpudata->epp_default_ac; + else + return cpudata->epp_default_dc; +} + +static int amd_pstate_power_supply_notifier(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct amd_cpudata *cpudata = container_of(nb, struct amd_cpudata, power_nb); + struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); + u8 epp; + int ret; + + if (event != PSY_EVENT_PROP_CHANGED) + return NOTIFY_OK; + + epp = amd_pstate_get_balanced_epp(policy); + + ret = amd_pstate_set_epp(policy, epp); + if (ret) + pr_warn("Failed to set CPU %d EPP %u: %d\n", cpudata->cpu, epp, ret); + + return NOTIFY_OK; +} +static void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata->power_nb.notifier_call) + power_supply_unreg_notifier(&cpudata->power_nb); + cpudata->dynamic_epp = false; +} + +static int amd_pstate_set_dynamic_epp(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + int ret; + u8 epp; + + epp = amd_pstate_get_balanced_epp(policy); + ret = amd_pstate_set_epp(policy, epp); + if (ret) + return ret; + + /* only enable notifier if things will actually change */ + if (cpudata->epp_default_ac != cpudata->epp_default_dc) { + cpudata->power_nb.notifier_call = amd_pstate_power_supply_notifier; + ret = power_supply_reg_notifier(&cpudata->power_nb); + if (ret) + goto cleanup; + } + + cpudata->dynamic_epp = true; + + return 0; + +cleanup: + amd_pstate_clear_dynamic_epp(policy); + + return ret; +} + /* Sysfs attributes */ /* @@ -1244,14 +1317,19 @@ static ssize_t store_energy_performance_preference( ssize_t ret; u8 epp; + if (cpudata->dynamic_epp) { + pr_debug("EPP cannot be set when dynamic EPP is enabled\n"); + return -EBUSY; + } + ret = sysfs_match_string(energy_perf_strings, buf); if (ret < 0) return -EINVAL; - if (!ret) - epp = cpudata->epp_default; - else + if (ret) epp = epp_values[ret]; + else + epp = amd_pstate_get_balanced_epp(policy); if (epp > 0 && policy->policy == CPUFREQ_POLICY_PERFORMANCE) { pr_debug("EPP cannot be set under performance policy\n"); @@ -1259,6 +1337,8 @@ static ssize_t store_energy_performance_preference( } ret = amd_pstate_set_epp(policy, epp); + if (ret) + return ret; return ret ? ret : count; } @@ -1620,12 +1700,42 @@ static ssize_t prefcore_show(struct device *dev, return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); } +static ssize_t dynamic_epp_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", str_enabled_disabled(dynamic_epp)); +} + +static ssize_t dynamic_epp_store(struct device *a, struct device_attribute *b, + const char *buf, size_t count) +{ + bool enabled; + int ret; + + ret = kstrtobool(buf, &enabled); + if (ret) + return ret; + + if (dynamic_epp == enabled) + return -EINVAL; + + /* reinitialize with desired dynamic EPP value */ + dynamic_epp = enabled; + ret = amd_pstate_change_driver_mode(cppc_state); + if (ret) + dynamic_epp = false; + + return ret ? ret : count; +} + static DEVICE_ATTR_RW(status); static DEVICE_ATTR_RO(prefcore); +static DEVICE_ATTR_RW(dynamic_epp); static struct attribute *pstate_global_attributes[] = { &dev_attr_status.attr, &dev_attr_prefcore.attr, + &dev_attr_dynamic_epp.attr, NULL }; @@ -1715,13 +1825,17 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (amd_pstate_acpi_pm_profile_server() || amd_pstate_acpi_pm_profile_undefined()) { policy->policy = CPUFREQ_POLICY_PERFORMANCE; - cpudata->epp_default = amd_pstate_get_epp(cpudata); + cpudata->epp_default_ac = cpudata->epp_default_dc = amd_pstate_get_epp(cpudata); } else { policy->policy = CPUFREQ_POLICY_POWERSAVE; - cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE; + cpudata->epp_default_ac = AMD_CPPC_EPP_PERFORMANCE; + cpudata->epp_default_dc = AMD_CPPC_EPP_BALANCE_PERFORMANCE; } - ret = amd_pstate_set_epp(policy, cpudata->epp_default); + if (dynamic_epp) + ret = amd_pstate_set_dynamic_epp(policy); + else + ret = amd_pstate_set_epp(policy, amd_pstate_get_balanced_epp(policy)); if (ret) goto free_cpudata1; @@ -1753,6 +1867,8 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); amd_pstate_set_floor_perf(policy, cpudata->bios_floor_perf); + if (cpudata->dynamic_epp) + amd_pstate_clear_dynamic_epp(policy); kfree(cpudata); policy->driver_data = NULL; } diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 32b8b26ce388..d929ae3163b3 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -85,6 +85,11 @@ struct amd_aperf_mperf { * AMD P-State driver supports preferred core featue. * @epp_cached: Cached CPPC energy-performance preference value * @policy: Cpufreq policy value + * @suspended: If CPU core if offlined + * @epp_default_ac: Default EPP value for AC power source + * @epp_default_dc: Default EPP value for DC power source + * @dynamic_epp: Whether dynamic EPP is enabled + * @power_nb: Notifier block for power events * * The amd_cpudata is key private data for each CPU thread in AMD P-State, and * represents all the attributes and goals that AMD P-State requests at runtime. @@ -118,7 +123,10 @@ struct amd_cpudata { /* EPP feature related attributes*/ u32 policy; bool suspended; - u8 epp_default; + u8 epp_default_ac; + u8 epp_default_dc; + bool dynamic_epp; + struct notifier_block power_nb; }; /* -- cgit v1.2.3 From da8afb1c666a4a966f0ab91dc336df4c855bc7b2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:08 -0500 Subject: cpufreq/amd-pstate: add kernel command line to override dynamic epp Add `amd_dynamic_epp=enable` and `amd_dynamic_epp=disable` to override the kernel configuration option `CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP` locally. Signed-off-by: Mario Limonciello (AMD) Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ Documentation/admin-guide/pm/amd-pstate.rst | 7 +++++++ drivers/cpufreq/amd-pstate.c | 11 +++++++++++ 3 files changed, 25 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 03a550630644..9552819051cd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -493,6 +493,13 @@ Kernel parameters disable Disable amd-pstate preferred core. + amd_dynamic_epp= + [X86] + disable + Disable amd-pstate dynamic EPP. + enable + Enable amd-pstate dynamic EPP. + amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index bb1341763882..01e6ab10f996 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -474,6 +474,13 @@ For systems that support ``amd-pstate`` preferred core, the core rankings will always be advertised by the platform. But OS can choose to ignore that via the kernel parameter ``amd_prefcore=disable``. +``amd_dynamic_epp`` + +When AMD pstate is in auto mode, dynamic EPP will control whether the kernel +autonomously changes the EPP mode. The default is configured by +``CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP`` but can be explicitly enabled with +``amd_dynamic_epp=enable`` or disabled with ``amd_dynamic_epp=disable``. + User Space Interface in ``sysfs`` - General =========================================== diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 379e7dd44252..301e603e4966 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -2227,8 +2227,19 @@ static int __init amd_prefcore_param(char *str) return 0; } +static int __init amd_dynamic_epp_param(char *str) +{ + if (!strcmp(str, "disable")) + dynamic_epp = false; + if (!strcmp(str, "enable")) + dynamic_epp = true; + + return 0; +} + early_param("amd_pstate", amd_pstate_param); early_param("amd_prefcore", amd_prefcore_param); +early_param("amd_dynamic_epp", amd_dynamic_epp_param); MODULE_AUTHOR("Huang Rui "); MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); -- cgit v1.2.3 From 798c47593ccae7dd36c033e557f3f364a2056b9e Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:09 -0500 Subject: cpufreq/amd-pstate: Add support for platform profile class The platform profile core allows multiple drivers and devices to register platform profile support. When the legacy platform profile interface is used all drivers will adjust the platform profile as well. Add support for registering every CPU with the platform profile handler when dynamic EPP is enabled. The end result will be that changing the platform profile will modify EPP accordingly. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 4 +- drivers/cpufreq/Kconfig.x86 | 1 + drivers/cpufreq/amd-pstate.c | 106 ++++++++++++++++++++++++++-- drivers/cpufreq/amd-pstate.h | 6 ++ 4 files changed, 110 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index 01e6ab10f996..d68ddfea6a9d 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -357,7 +357,9 @@ config option `CONFIG_X86_AMD_PSTATE_DYNAMIC_EPP`. This behavior can also be ove at runtime by the sysfs file ``/sys/devices/system/cpu/cpufreq/policyX/dynamic_epp``. When set to enabled, the driver will select a different energy performance -profile when the machine is running on battery or AC power. +profile when the machine is running on battery or AC power. The driver will +also register with the platform profile handler to receive notifications of +user desired power state and react to those. When set to disabled, the driver will not change the energy performance profile based on the power source and will not react to user desired power state. diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index cdaa8d858045..a0dbb9808ae9 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -40,6 +40,7 @@ config X86_AMD_PSTATE select ACPI_PROCESSOR select ACPI_CPPC_LIB if X86_64 select CPU_FREQ_GOV_SCHEDUTIL if SMP + select ACPI_PLATFORM_PROFILE help This driver adds a CPUFreq driver which utilizes a fine grain processor performance frequency control range instead of legacy diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 301e603e4966..d553bbd3fecc 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1182,6 +1182,10 @@ static int amd_pstate_power_supply_notifier(struct notifier_block *nb, if (event != PSY_EVENT_PROP_CHANGED) return NOTIFY_OK; + /* dynamic actions are only applied while platform profile is in balanced */ + if (cpudata->current_profile != PLATFORM_PROFILE_BALANCED) + return 0; + epp = amd_pstate_get_balanced_epp(policy); ret = amd_pstate_set_epp(policy, epp); @@ -1190,12 +1194,77 @@ static int amd_pstate_power_supply_notifier(struct notifier_block *nb, return NOTIFY_OK; } + +static int amd_pstate_profile_probe(void *drvdata, unsigned long *choices) +{ + set_bit(PLATFORM_PROFILE_LOW_POWER, choices); + set_bit(PLATFORM_PROFILE_BALANCED, choices); + set_bit(PLATFORM_PROFILE_PERFORMANCE, choices); + + return 0; +} + +static int amd_pstate_profile_get(struct device *dev, + enum platform_profile_option *profile) +{ + struct amd_cpudata *cpudata = dev_get_drvdata(dev); + + *profile = cpudata->current_profile; + + return 0; +} + +static int amd_pstate_profile_set(struct device *dev, + enum platform_profile_option profile) +{ + struct amd_cpudata *cpudata = dev_get_drvdata(dev); + struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); + int ret; + + switch (profile) { + case PLATFORM_PROFILE_LOW_POWER: + ret = amd_pstate_set_epp(policy, AMD_CPPC_EPP_POWERSAVE); + if (ret) + return ret; + break; + case PLATFORM_PROFILE_BALANCED: + ret = amd_pstate_set_epp(policy, + amd_pstate_get_balanced_epp(policy)); + if (ret) + return ret; + break; + case PLATFORM_PROFILE_PERFORMANCE: + ret = amd_pstate_set_epp(policy, AMD_CPPC_EPP_PERFORMANCE); + if (ret) + return ret; + break; + default: + pr_err("Unknown Platform Profile %d\n", profile); + return -EOPNOTSUPP; + } + + cpudata->current_profile = profile; + + return 0; +} + +static const struct platform_profile_ops amd_pstate_profile_ops = { + .probe = amd_pstate_profile_probe, + .profile_set = amd_pstate_profile_set, + .profile_get = amd_pstate_profile_get, +}; + static void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; if (cpudata->power_nb.notifier_call) power_supply_unreg_notifier(&cpudata->power_nb); + if (cpudata->ppdev) { + platform_profile_remove(cpudata->ppdev); + cpudata->ppdev = NULL; + } + kfree(cpudata->profile_name); cpudata->dynamic_epp = false; } @@ -1205,11 +1274,35 @@ static int amd_pstate_set_dynamic_epp(struct cpufreq_policy *policy) int ret; u8 epp; - epp = amd_pstate_get_balanced_epp(policy); + switch (cpudata->current_profile) { + case PLATFORM_PROFILE_PERFORMANCE: + epp = AMD_CPPC_EPP_PERFORMANCE; + break; + case PLATFORM_PROFILE_LOW_POWER: + epp = AMD_CPPC_EPP_POWERSAVE; + break; + case PLATFORM_PROFILE_BALANCED: + epp = amd_pstate_get_balanced_epp(policy); + break; + default: + pr_err("Unknown Platform Profile %d\n", cpudata->current_profile); + return -EOPNOTSUPP; + } ret = amd_pstate_set_epp(policy, epp); if (ret) return ret; + cpudata->profile_name = kasprintf(GFP_KERNEL, "amd-pstate-epp-cpu%d", cpudata->cpu); + + cpudata->ppdev = platform_profile_register(get_cpu_device(policy->cpu), + cpudata->profile_name, + policy->driver_data, + &amd_pstate_profile_ops); + if (IS_ERR(cpudata->ppdev)) { + ret = PTR_ERR(cpudata->ppdev); + goto cleanup; + } + /* only enable notifier if things will actually change */ if (cpudata->epp_default_ac != cpudata->epp_default_dc) { cpudata->power_nb.notifier_call = amd_pstate_power_supply_notifier; @@ -1310,8 +1403,8 @@ static ssize_t show_energy_performance_available_preferences( return offset; } -static ssize_t store_energy_performance_preference( - struct cpufreq_policy *policy, const char *buf, size_t count) +static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy, + const char *buf, size_t count) { struct amd_cpudata *cpudata = policy->driver_data; ssize_t ret; @@ -1331,7 +1424,7 @@ static ssize_t store_energy_performance_preference( else epp = amd_pstate_get_balanced_epp(policy); - if (epp > 0 && policy->policy == CPUFREQ_POLICY_PERFORMANCE) { + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { pr_debug("EPP cannot be set under performance policy\n"); return -EBUSY; } @@ -1343,8 +1436,7 @@ static ssize_t store_energy_performance_preference( return ret ? ret : count; } -static ssize_t show_energy_performance_preference( - struct cpufreq_policy *policy, char *buf) +static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf) { struct amd_cpudata *cpudata = policy->driver_data; u8 preference, epp; @@ -1826,10 +1918,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) amd_pstate_acpi_pm_profile_undefined()) { policy->policy = CPUFREQ_POLICY_PERFORMANCE; cpudata->epp_default_ac = cpudata->epp_default_dc = amd_pstate_get_epp(cpudata); + cpudata->current_profile = PLATFORM_PROFILE_PERFORMANCE; } else { policy->policy = CPUFREQ_POLICY_POWERSAVE; cpudata->epp_default_ac = AMD_CPPC_EPP_PERFORMANCE; cpudata->epp_default_dc = AMD_CPPC_EPP_BALANCE_PERFORMANCE; + cpudata->current_profile = PLATFORM_PROFILE_BALANCED; } if (dynamic_epp) diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index d929ae3163b3..a7e52f79a802 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -9,6 +9,7 @@ #define _LINUX_AMD_PSTATE_H #include +#include /********************************************************************* * AMD P-state INTERFACE * @@ -127,6 +128,11 @@ struct amd_cpudata { u8 epp_default_dc; bool dynamic_epp; struct notifier_block power_nb; + + /* platform profile */ + enum platform_profile_option current_profile; + struct device *ppdev; + char *profile_name; }; /* -- cgit v1.2.3 From 6927f21852f38db2975b5d5539cbe5241c25a99b Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:10 -0500 Subject: cpufreq/amd-pstate: Add support for raw EPP writes The energy performance preference field of the CPPC request MSR supports values from 0 to 255, but the strings only offer 4 values. The other values are useful for tuning the performance of some workloads. Add support for writing the raw energy performance preference value to the sysfs file. If the last value written was an integer then an integer will be returned. If the last value written was a string then a string will be returned. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- Documentation/admin-guide/pm/amd-pstate.rst | 16 +++++++++---- drivers/cpufreq/amd-pstate.c | 36 +++++++++++++++++++++-------- drivers/cpufreq/amd-pstate.h | 1 + 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index d68ddfea6a9d..f8e7050fc762 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -316,16 +316,22 @@ A list of all the supported EPP preferences that could be used for These profiles represent different hints that are provided to the low-level firmware about the user's desired energy vs efficiency tradeoff. ``default`` represents the epp value is set by platform -firmware. This attribute is read-only. +firmware. ``custom`` designates that integer values 0-255 may be written +as well. This attribute is read-only. ``energy_performance_preference`` The current energy performance preference can be read from this attribute. and user can change current preference according to energy or performance needs -Please get all support profiles list from -``energy_performance_available_preferences`` attribute, all the profiles are -integer values defined between 0 to 255 when EPP feature is enabled by platform -firmware, but if the dynamic EPP feature is enabled, driver will block writes. +Coarse named profiles are available in the attribute +``energy_performance_available_preferences``. +Users can also write individual integer values between 0 to 255. +When dynamic EPP is enabled, writes to energy_performance_preference are blocked +even when EPP feature is enabled by platform firmware. Lower epp values shift the bias +towards improved performance while a higher epp value shifts the bias towards +power-savings. The exact impact can change from one platform to the other. +If a valid integer was last written, then a number will be returned on future reads. +If a valid string was last written then a string will be returned on future reads. This attribute is read-write. ``boost`` diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d553bbd3fecc..83e937764fab 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -109,6 +109,7 @@ static struct quirk_entry *quirks; * 2 balance_performance * 3 balance_power * 4 power + * 5 custom (for raw EPP values) */ enum energy_perf_value_index { EPP_INDEX_DEFAULT = 0, @@ -116,6 +117,7 @@ enum energy_perf_value_index { EPP_INDEX_BALANCE_PERFORMANCE, EPP_INDEX_BALANCE_POWERSAVE, EPP_INDEX_POWERSAVE, + EPP_INDEX_CUSTOM, EPP_INDEX_MAX, }; @@ -125,6 +127,7 @@ static const char * const energy_perf_strings[] = { [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", [EPP_INDEX_POWERSAVE] = "power", + [EPP_INDEX_CUSTOM] = "custom", }; static_assert(ARRAY_SIZE(energy_perf_strings) == EPP_INDEX_MAX); @@ -135,7 +138,7 @@ static unsigned int epp_values[] = { [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, }; -static_assert(ARRAY_SIZE(epp_values) == EPP_INDEX_MAX); +static_assert(ARRAY_SIZE(epp_values) == EPP_INDEX_MAX - 1); typedef int (*cppc_mode_transition_fn)(int); @@ -1408,6 +1411,7 @@ static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy { struct amd_cpudata *cpudata = policy->driver_data; ssize_t ret; + bool raw_epp = false; u8 epp; if (cpudata->dynamic_epp) { @@ -1415,14 +1419,21 @@ static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy return -EBUSY; } - ret = sysfs_match_string(energy_perf_strings, buf); - if (ret < 0) - return -EINVAL; - - if (ret) - epp = epp_values[ret]; - else - epp = amd_pstate_get_balanced_epp(policy); + /* + * if the value matches a number, use that, otherwise see if + * matches an index in the energy_perf_strings array + */ + ret = kstrtou8(buf, 0, &epp); + raw_epp = !ret; + if (ret) { + ret = sysfs_match_string(energy_perf_strings, buf); + if (ret < 0 || ret == EPP_INDEX_CUSTOM) + return -EINVAL; + if (ret) + epp = epp_values[ret]; + else + epp = amd_pstate_get_balanced_epp(policy); + } if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { pr_debug("EPP cannot be set under performance policy\n"); @@ -1433,7 +1444,9 @@ static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy if (ret) return ret; - return ret ? ret : count; + cpudata->raw_epp = raw_epp; + + return count; } static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf) @@ -1443,6 +1456,9 @@ static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); + if (cpudata->raw_epp) + return sysfs_emit(buf, "%u\n", epp); + switch (epp) { case AMD_CPPC_EPP_PERFORMANCE: preference = EPP_INDEX_PERFORMANCE; diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index a7e52f79a802..f7461d1b6bf3 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -127,6 +127,7 @@ struct amd_cpudata { u8 epp_default_ac; u8 epp_default_dc; bool dynamic_epp; + bool raw_epp; struct notifier_block power_nb; /* platform profile */ -- cgit v1.2.3 From 7e173bc310d2b1df018edc66334a5304305889a2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sun, 29 Mar 2026 15:38:11 -0500 Subject: cpufreq/amd-pstate-ut: Add a unit test for raw EPP Ensure that all supported raw EPP values work properly. Export the driver helpers used by the test module so the test can drive raw EPP writes and temporarily disable dynamic EPP while it runs. Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate-ut.c | 109 ++++++++++++++++++++++++++++++++++++++++ drivers/cpufreq/amd-pstate.c | 11 ++-- drivers/cpufreq/amd-pstate.h | 4 ++ 3 files changed, 120 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 1f62ab6438b4..aa8a464fab47 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ static char *test_list; module_param(test_list, charp, 0444); MODULE_PARM_DESC(test_list, "Comma-delimited list of tests to run (empty means run all tests)"); +DEFINE_FREE(cleanup_page, void *, if (_T) free_page((unsigned long)_T)) struct amd_pstate_ut_struct { const char *name; @@ -54,6 +56,7 @@ static int amd_pstate_ut_acpi_cpc_valid(u32 index); static int amd_pstate_ut_check_enabled(u32 index); static int amd_pstate_ut_check_perf(u32 index); static int amd_pstate_ut_check_freq(u32 index); +static int amd_pstate_ut_epp(u32 index); static int amd_pstate_ut_check_driver(u32 index); static int amd_pstate_ut_check_freq_attrs(u32 index); @@ -62,6 +65,7 @@ static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { {"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled }, {"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf }, {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }, + {"amd_pstate_ut_epp", amd_pstate_ut_epp }, {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver }, {"amd_pstate_ut_check_freq_attrs", amd_pstate_ut_check_freq_attrs }, }; @@ -268,6 +272,111 @@ static int amd_pstate_set_mode(enum amd_pstate_mode mode) return amd_pstate_update_status(mode_str, strlen(mode_str)); } +static int amd_pstate_ut_epp(u32 index) +{ + struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; + char *buf __free(cleanup_page) = NULL; + static const char * const epp_strings[] = { + "performance", + "balance_performance", + "balance_power", + "power", + }; + struct amd_cpudata *cpudata; + enum amd_pstate_mode orig_mode; + bool orig_dynamic_epp; + int ret, cpu = 0; + int i; + u16 epp; + + policy = cpufreq_cpu_get(cpu); + if (!policy) + return -ENODEV; + + cpudata = policy->driver_data; + orig_mode = amd_pstate_get_status(); + orig_dynamic_epp = cpudata->dynamic_epp; + + /* disable dynamic EPP before running test */ + if (cpudata->dynamic_epp) { + pr_debug("Dynamic EPP is enabled, disabling it\n"); + amd_pstate_clear_dynamic_epp(policy); + } + + buf = (char *)__get_free_page(GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = amd_pstate_set_mode(AMD_PSTATE_ACTIVE); + if (ret) + goto out; + + for (epp = 0; epp <= U8_MAX; epp++) { + u8 val; + + /* write all EPP values */ + memset(buf, 0, PAGE_SIZE); + snprintf(buf, PAGE_SIZE, "%d", epp); + ret = store_energy_performance_preference(policy, buf, strlen(buf)); + if (ret < 0) + goto out; + + /* check if the EPP value reads back correctly for raw numbers */ + memset(buf, 0, PAGE_SIZE); + ret = show_energy_performance_preference(policy, buf); + if (ret < 0) + goto out; + strreplace(buf, '\n', '\0'); + ret = kstrtou8(buf, 0, &val); + if (!ret && epp != val) { + pr_err("Raw EPP value mismatch: %d != %d\n", epp, val); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(epp_strings); i++) { + memset(buf, 0, PAGE_SIZE); + snprintf(buf, PAGE_SIZE, "%s", epp_strings[i]); + ret = store_energy_performance_preference(policy, buf, strlen(buf)); + if (ret < 0) + goto out; + + memset(buf, 0, PAGE_SIZE); + ret = show_energy_performance_preference(policy, buf); + if (ret < 0) + goto out; + strreplace(buf, '\n', '\0'); + + if (strcmp(buf, epp_strings[i])) { + pr_err("String EPP value mismatch: %s != %s\n", buf, epp_strings[i]); + ret = -EINVAL; + goto out; + } + } + + ret = 0; + +out: + if (orig_dynamic_epp) { + int ret2; + + ret2 = amd_pstate_set_mode(AMD_PSTATE_DISABLE); + if (!ret && ret2) + ret = ret2; + } + + if (orig_mode != amd_pstate_get_status()) { + int ret2; + + ret2 = amd_pstate_set_mode(orig_mode); + if (!ret && ret2) + ret = ret2; + } + + return ret; +} + static int amd_pstate_ut_check_driver(u32 index) { enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE; diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 83e937764fab..ca593c209111 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1257,7 +1257,7 @@ static const struct platform_profile_ops amd_pstate_profile_ops = { .profile_get = amd_pstate_profile_get, }; -static void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) +void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; @@ -1270,6 +1270,7 @@ static void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy) kfree(cpudata->profile_name); cpudata->dynamic_epp = false; } +EXPORT_SYMBOL_GPL(amd_pstate_clear_dynamic_epp); static int amd_pstate_set_dynamic_epp(struct cpufreq_policy *policy) { @@ -1406,8 +1407,8 @@ static ssize_t show_energy_performance_available_preferences( return offset; } -static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy, - const char *buf, size_t count) +ssize_t store_energy_performance_preference(struct cpufreq_policy *policy, + const char *buf, size_t count) { struct amd_cpudata *cpudata = policy->driver_data; ssize_t ret; @@ -1448,8 +1449,9 @@ static ssize_t store_energy_performance_preference(struct cpufreq_policy *policy return count; } +EXPORT_SYMBOL_GPL(store_energy_performance_preference); -static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf) +ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf) { struct amd_cpudata *cpudata = policy->driver_data; u8 preference, epp; @@ -1478,6 +1480,7 @@ static ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); } +EXPORT_SYMBOL_GPL(show_energy_performance_preference); static ssize_t store_amd_pstate_floor_freq(struct cpufreq_policy *policy, const char *buf, size_t count) diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index f7461d1b6bf3..e4722e54387b 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -150,6 +150,10 @@ enum amd_pstate_mode { const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode); int amd_pstate_get_status(void); int amd_pstate_update_status(const char *buf, size_t size); +ssize_t store_energy_performance_preference(struct cpufreq_policy *policy, + const char *buf, size_t count); +ssize_t show_energy_performance_preference(struct cpufreq_policy *policy, char *buf); +void amd_pstate_clear_dynamic_epp(struct cpufreq_policy *policy); struct freq_attr; -- cgit v1.2.3 From 86d71f1d7686cecebbafb371ad58c6ad7f80a93a Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 16 Mar 2026 08:18:48 +0000 Subject: cpufreq/amd-pstate: Pass the policy to amd_pstate_update() All callers of amd_pstate_update() already have a reference to the cpufreq_policy object. Pass the entire policy object and grab the cpudata using "policy->driver_data" instead of passing the cpudata and unnecessarily grabbing another read-side reference to the cpufreq policy object when it is already available in the caller. No functional changes intended. Reviewed-by: Mario Limonciello (AMD) Acked-by: Viresh Kumar Signed-off-by: K Prateek Nayak Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20260316081849.19368-2-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ca593c209111..2ea4d27fe020 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -643,15 +643,12 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) return true; } -static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, +static void amd_pstate_update(struct cpufreq_policy *policy, u8 min_perf, u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags) { - struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); + struct amd_cpudata *cpudata = policy->driver_data; union perf_cached perf = READ_ONCE(cpudata->perf); - if (!policy) - return; - /* limit the max perf when core performance boost feature is disabled */ if (!cpudata->boost_supported) max_perf = min_t(u8, perf.nominal_perf, max_perf); @@ -766,7 +763,7 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy, if (!fast_switch) cpufreq_freq_transition_begin(policy, &freqs); - amd_pstate_update(cpudata, perf.min_limit_perf, des_perf, + amd_pstate_update(policy, perf.min_limit_perf, des_perf, perf.max_limit_perf, fast_switch, policy->governor->flags); @@ -828,7 +825,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (max_perf < min_perf) max_perf = min_perf; - amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, + amd_pstate_update(policy, min_perf, des_perf, max_perf, true, policy->governor->flags); } -- cgit v1.2.3 From c03791085adcd61fa9b766ab303c7d0941d7378d Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 16 Mar 2026 08:18:49 +0000 Subject: cpufreq: Pass the policy to cpufreq_driver->adjust_perf() cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent writer(s), however amd-pstate depends on fetching the cpudata via the policy's driver data which necessitates grabbing the reference. Since schedutil governor can call "cpufreq_driver->update_perf()" during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled, fetching the policy object using the cpufreq_cpu_get() helper in the scheduler fast-path leads to "BUG: scheduling while atomic" on PREEMPT_RT [1]. Pass the cached cpufreq policy object in sg_policy to the update_perf() instead of just the CPU. The CPU can be inferred using "policy->cpu". The lifetime of cpufreq_policy object outlasts that of the governor and the cpufreq driver (allocated when the CPU is onlined and only reclaimed when the CPU is offlined / the CPU device is removed) which makes it safe to be referenced throughout the governor's lifetime. Closes:https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1] Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State") Reported-by: Bert Karwatzki Acked-by: Viresh Kumar Signed-off-by: K Prateek Nayak Acked-by: Gary Guo # Rust Reviewed-by: Gautham R. Shenoy Reviewed-by: Zhongqiu Han Link: https://lore.kernel.org/r/20260316081849.19368-3-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 3 +-- drivers/cpufreq/cpufreq.c | 6 +++--- drivers/cpufreq/intel_pstate.c | 4 ++-- include/linux/cpufreq.h | 4 ++-- kernel/sched/cpufreq_schedutil.c | 5 +++-- rust/kernel/cpufreq.rs | 13 ++++++------- 6 files changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2ea4d27fe020..c825fab0bf5c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -788,13 +788,12 @@ static unsigned int amd_pstate_fast_switch(struct cpufreq_policy *policy, return policy->cur; } -static void amd_pstate_adjust_perf(unsigned int cpu, +static void amd_pstate_adjust_perf(struct cpufreq_policy *policy, unsigned long _min_perf, unsigned long target_perf, unsigned long capacity) { u8 max_perf, min_perf, des_perf, cap_perf; - struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; union perf_cached perf; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 277884d91913..90e939069cde 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2231,7 +2231,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); /** * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go. - * @cpu: Target CPU. + * @policy: cpufreq policy object of the target CPU. * @min_perf: Minimum (required) performance level (units of @capacity). * @target_perf: Target (desired) performance level (units of @capacity). * @capacity: Capacity of the target CPU. @@ -2250,12 +2250,12 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); * parallel with either ->target() or ->target_index() or ->fast_switch() for * the same CPU. */ -void cpufreq_driver_adjust_perf(unsigned int cpu, +void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity) { - cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity); + cpufreq_driver->adjust_perf(policy, min_perf, target_perf, capacity); } /** diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 11c58af41900..0f50034e4b68 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3239,12 +3239,12 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, return target_pstate * cpu->pstate.scaling; } -static void intel_cpufreq_adjust_perf(unsigned int cpunum, +static void intel_cpufreq_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity) { - struct cpudata *cpu = all_cpu_data[cpunum]; + struct cpudata *cpu = all_cpu_data[policy->cpu]; u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached); int old_pstate = cpu->pstate.current_pstate; int cap_pstate, min_pstate, max_pstate, target_pstate; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..4317c5a312bd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -372,7 +372,7 @@ struct cpufreq_driver { * conditions) scale invariance can be disabled, which causes the * schedutil governor to fall back to the latter. */ - void (*adjust_perf)(unsigned int cpu, + void (*adjust_perf)(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); @@ -617,7 +617,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); -void cpufreq_driver_adjust_perf(unsigned int cpu, +void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy, unsigned long min_perf, unsigned long target_perf, unsigned long capacity); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 153232dd8276..ae9fd211cec1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -461,6 +461,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long prev_util = sg_cpu->util; unsigned long max_cap; @@ -482,10 +483,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; - cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min, sg_cpu->util, max_cap); - sg_cpu->sg_policy->last_freq_update_time = time; + sg_policy->last_freq_update_time = time; } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index f5adee48d40c..d8d26870bea2 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -1257,18 +1257,17 @@ impl Registration { /// # Safety /// /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. unsafe extern "C" fn adjust_perf_callback( - cpu: c_uint, + ptr: *mut bindings::cpufreq_policy, min_perf: c_ulong, target_perf: c_ulong, capacity: c_ulong, ) { - // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. - let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) }; - - if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) { - T::adjust_perf(&mut policy, min_perf, target_perf, capacity); - } + // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the + // lifetime of `policy`. + let policy = unsafe { Policy::from_raw_mut(ptr) }; + T::adjust_perf(policy, min_perf, target_perf, capacity); } /// Driver's `get_intermediate` callback. -- cgit v1.2.3 From 9487e2a00e7b3c6f258c5c99953f470eba6fb61d Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 2 Apr 2026 15:56:11 +0530 Subject: MAINTAINERS: amd-pstate: Step down as maintainer, add Prateek as reviewer Mario Limonciello has led amd-pstate maintenance in recent years and has done excellent work. The amd-pstate driver is in good hands with him. I am stepping down as co-maintainer as I move on to other things. Add K Prateek Nayak as a reviewer. He has been actively contributing to the driver including preferred-core and ITMT improvements, and has been helping review amd-pstate patches for a while now. Signed-off-by: Gautham R. Shenoy Acked-by: K Prateek Nayak Acked-by: Mario Limonciello (AMD) Link: https://lore.kernel.org/r/20260402102611.16519-1-gautham.shenoy@amd.com Signed-off-by: Mario Limonciello (AMD) --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7d10988cbc62..50723bc3e69e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1234,9 +1234,9 @@ F: drivers/gpu/drm/amd/pm/ AMD PSTATE DRIVER M: Huang Rui -M: Gautham R. Shenoy M: Mario Limonciello R: Perry Yuan +R: K Prateek Nayak L: linux-pm@vger.kernel.org S: Supported F: Documentation/admin-guide/pm/amd-pstate.rst -- cgit v1.2.3 From 82bb8dc953c2203bb710b7358f030b1d9bdf698d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 14 Jan 2026 10:31:15 +0100 Subject: PM / devfreq: Remove unneeded casting for HZ_PER_KHZ HZ_PER_KHZ is defined as UL (unsigned long), no need to repeat that. Signed-off-by: Andy Shevchenko Reviewed-by: Lifeng Zheng Acked-by: MyungJoo Ham Signed-off-by: Chanwoo Choi Link: https://lore.kernel.org/lkml/20260114093115.276818-1-andriy.shevchenko@linux.intel.com/ --- drivers/devfreq/devfreq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index c0a74091b904..54f0b18536db 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -146,10 +146,9 @@ void devfreq_get_freq_range(struct devfreq *devfreq, DEV_PM_QOS_MIN_FREQUENCY); qos_max_freq = dev_pm_qos_read_value(devfreq->dev.parent, DEV_PM_QOS_MAX_FREQUENCY); - *min_freq = max(*min_freq, (unsigned long)HZ_PER_KHZ * qos_min_freq); + *min_freq = max(*min_freq, HZ_PER_KHZ * qos_min_freq); if (qos_max_freq != PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE) - *max_freq = min(*max_freq, - (unsigned long)HZ_PER_KHZ * qos_max_freq); + *max_freq = min(*max_freq, HZ_PER_KHZ * qos_max_freq); /* Apply constraints from OPP interface */ *max_freq = clamp(*max_freq, devfreq->scaling_min_freq, devfreq->scaling_max_freq); -- cgit v1.2.3 From 943a872fe41a8352d64b20de77d8b707978e5732 Mon Sep 17 00:00:00 2001 From: Pengjie Zhang Date: Tue, 16 Dec 2025 11:11:53 +0800 Subject: PM / devfreq: use _visible attribute to replace create/remove_sysfs_files() Previously, non-generic attributes (polling_interval, timer) used separate create/delete logic, leading to race conditions during concurrent access in creation/deletion. Multi-threaded operations also caused inconsistencies between governor capabilities and attribute states. 1.Use is_visible + sysfs_update_group() to unify management of these attributes, eliminating creation/deletion races. 2.Add locks and validation to these attributes, ensuring consistency between current governor capabilities and attribute operations in multi-threaded environments. Reviewed-by: Jonathan Cameron Reviewed-by: Jie Zhan Signed-off-by: Pengjie Zhang Signed-off-by: Chanwoo Choi Link: https://www.spinics.net/lists/kernel/msg5967745.html --- drivers/devfreq/devfreq.c | 103 +++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 43 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 54f0b18536db..82dd9a43dc62 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -38,6 +38,7 @@ static struct class *devfreq_class; static struct dentry *devfreq_debugfs; +static const struct attribute_group gov_attr_group; /* * devfreq core provides delayed work based load monitoring helper @@ -784,11 +785,6 @@ static void devfreq_dev_release(struct device *dev) kfree(devfreq); } -static void create_sysfs_files(struct devfreq *devfreq, - const struct devfreq_governor *gov); -static void remove_sysfs_files(struct devfreq *devfreq, - const struct devfreq_governor *gov); - /** * devfreq_add_device() - Add devfreq feature to the device * @dev: the device to add devfreq feature. @@ -955,7 +951,10 @@ struct devfreq *devfreq_add_device(struct device *dev, __func__); goto err_init; } - create_sysfs_files(devfreq, devfreq->governor); + + err = sysfs_update_group(&devfreq->dev.kobj, &gov_attr_group); + if (err) + goto err_init; list_add(&devfreq->node, &devfreq_list); @@ -994,12 +993,9 @@ int devfreq_remove_device(struct devfreq *devfreq) devfreq_cooling_unregister(devfreq->cdev); - if (devfreq->governor) { + if (devfreq->governor) devfreq->governor->event_handler(devfreq, DEVFREQ_GOV_STOP, NULL); - remove_sysfs_files(devfreq, devfreq->governor); - } - device_unregister(&devfreq->dev); return 0; @@ -1459,7 +1455,6 @@ static ssize_t governor_store(struct device *dev, struct device_attribute *attr, __func__, df->governor->name, ret); goto out; } - remove_sysfs_files(df, df->governor); /* * Start the new governor and create the specific sysfs files @@ -1488,7 +1483,7 @@ static ssize_t governor_store(struct device *dev, struct device_attribute *attr, * Create the sysfs files for the new governor. But if failed to start * the new governor, restore the sysfs files of previous governor. */ - create_sysfs_files(df, df->governor); + ret = sysfs_update_group(&df->dev.kobj, &gov_attr_group); out: mutex_unlock(&devfreq_list_lock); @@ -1806,14 +1801,17 @@ static struct attribute *devfreq_attrs[] = { &dev_attr_trans_stat.attr, NULL, }; -ATTRIBUTE_GROUPS(devfreq); static ssize_t polling_interval_show(struct device *dev, struct device_attribute *attr, char *buf) { struct devfreq *df = to_devfreq(dev); - if (!df->profile) + /* Protect against race between sysfs attrs update and read/write */ + guard(mutex)(&devfreq_list_lock); + + if (!df->profile || !df->governor || + !IS_SUPPORTED_ATTR(df->governor->attrs, POLLING_INTERVAL)) return -EINVAL; return sprintf(buf, "%d\n", df->profile->polling_ms); @@ -1827,7 +1825,10 @@ static ssize_t polling_interval_store(struct device *dev, unsigned int value; int ret; - if (!df->governor) + guard(mutex)(&devfreq_list_lock); + + if (!df->governor || + !IS_SUPPORTED_ATTR(df->governor->attrs, POLLING_INTERVAL)) return -EINVAL; ret = sscanf(buf, "%u", &value); @@ -1846,7 +1847,10 @@ static ssize_t timer_show(struct device *dev, { struct devfreq *df = to_devfreq(dev); - if (!df->profile) + guard(mutex)(&devfreq_list_lock); + + if (!df->profile || !df->governor || + !IS_SUPPORTED_ATTR(df->governor->attrs, TIMER)) return -EINVAL; return sprintf(buf, "%s\n", timer_name[df->profile->timer]); @@ -1860,7 +1864,10 @@ static ssize_t timer_store(struct device *dev, struct device_attribute *attr, int timer = -1; int ret = 0, i; - if (!df->governor || !df->profile) + guard(mutex)(&devfreq_list_lock); + + if (!df->governor || !df->profile || + !IS_SUPPORTED_ATTR(df->governor->attrs, TIMER)) return -EINVAL; ret = sscanf(buf, "%16s", str_timer); @@ -1904,37 +1911,47 @@ out: } static DEVICE_ATTR_RW(timer); -#define CREATE_SYSFS_FILE(df, name) \ -{ \ - int ret; \ - ret = sysfs_create_file(&df->dev.kobj, &dev_attr_##name.attr); \ - if (ret < 0) { \ - dev_warn(&df->dev, \ - "Unable to create attr(%s)\n", "##name"); \ - } \ -} \ +static struct attribute *governor_attrs[] = { + &dev_attr_polling_interval.attr, + &dev_attr_timer.attr, + NULL +}; -/* Create the specific sysfs files which depend on each governor. */ -static void create_sysfs_files(struct devfreq *devfreq, - const struct devfreq_governor *gov) +static umode_t gov_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) { - if (IS_SUPPORTED_ATTR(gov->attrs, POLLING_INTERVAL)) - CREATE_SYSFS_FILE(devfreq, polling_interval); - if (IS_SUPPORTED_ATTR(gov->attrs, TIMER)) - CREATE_SYSFS_FILE(devfreq, timer); -} + struct device *dev = kobj_to_dev(kobj); + struct devfreq *df = to_devfreq(dev); -/* Remove the specific sysfs files which depend on each governor. */ -static void remove_sysfs_files(struct devfreq *devfreq, - const struct devfreq_governor *gov) -{ - if (IS_SUPPORTED_ATTR(gov->attrs, POLLING_INTERVAL)) - sysfs_remove_file(&devfreq->dev.kobj, - &dev_attr_polling_interval.attr); - if (IS_SUPPORTED_ATTR(gov->attrs, TIMER)) - sysfs_remove_file(&devfreq->dev.kobj, &dev_attr_timer.attr); + if (!df->governor || !df->governor->attrs) + return 0; + + if (attr == &dev_attr_polling_interval.attr && + IS_SUPPORTED_ATTR(df->governor->attrs, POLLING_INTERVAL)) + return attr->mode; + + if (attr == &dev_attr_timer.attr && + IS_SUPPORTED_ATTR(df->governor->attrs, TIMER)) + return attr->mode; + + return 0; } +static const struct attribute_group devfreq_group = { + .attrs = devfreq_attrs, +}; + +static const struct attribute_group gov_attr_group = { + .attrs = governor_attrs, + .is_visible = gov_attr_visible, +}; + +static const struct attribute_group *devfreq_groups[] = { + &devfreq_group, + &gov_attr_group, + NULL +}; + /** * devfreq_summary_show() - Show the summary of the devfreq devices * @s: seq_file instance to show the summary of devfreq devices -- cgit v1.2.3 From cd905830ea6184d6678386ce2d652bec324034d1 Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Mon, 26 Jan 2026 20:54:23 +0200 Subject: PM / devfreq: tegra30-devfreq: add support for Tegra114 Lets add Tegra114 support to activity monitor device as a preparation to upcoming EMC controller support. Signed-off-by: Svyatoslav Ryhel Reviewed-by: Mikko Perttunen Acked-by: Dmitry Osipenko Acked-by: MyungJoo Ham Signed-off-by: Chanwoo Choi Link: https://lore.kernel.org/lkml/20260126185423.77786-1-clamor95@gmail.com/ --- drivers/devfreq/tegra30-devfreq.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c index 8b57194ac698..401aac6a9f07 100644 --- a/drivers/devfreq/tegra30-devfreq.c +++ b/drivers/devfreq/tegra30-devfreq.c @@ -941,16 +941,22 @@ static int tegra_devfreq_probe(struct platform_device *pdev) return 0; } +/* + * The activity counter is incremented every 256 memory transactions. However, + * the number of clock cycles required for each transaction varies across + * different SoC generations. For instance, a single transaction takes 2 EMC + * clocks on Tegra30, 1 EMC clock on Tegra114, and 4 EMC clocks on Tegra124. + */ static const struct tegra_devfreq_soc_data tegra124_soc = { .configs = tegra124_device_configs, - - /* - * Activity counter is incremented every 256 memory transactions, - * and each transaction takes 4 EMC clocks. - */ .count_weight = 4 * 256, }; +static const struct tegra_devfreq_soc_data tegra114_soc = { + .configs = tegra124_device_configs, + .count_weight = 256, +}; + static const struct tegra_devfreq_soc_data tegra30_soc = { .configs = tegra30_device_configs, .count_weight = 2 * 256, @@ -958,6 +964,7 @@ static const struct tegra_devfreq_soc_data tegra30_soc = { static const struct of_device_id tegra_devfreq_of_match[] = { { .compatible = "nvidia,tegra30-actmon", .data = &tegra30_soc, }, + { .compatible = "nvidia,tegra114-actmon", .data = &tegra114_soc, }, { .compatible = "nvidia,tegra124-actmon", .data = &tegra124_soc, }, { }, }; -- cgit v1.2.3 From 629be87e0d6be4c3683d3b39811804f42a78f04b Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Fri, 3 Apr 2026 16:45:42 +0800 Subject: cpuidle: Simplify cpuidle_register_device() with guard() Use guard() macro for mutex to simplify the control flow in cpuidle_register_device(). Signed-off-by: Huisong Li Link: https://patch.msgid.link/20260403084542.708104-1-lihuisong@huawei.com Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c7876e9e024f..8c037db46792 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -679,16 +679,16 @@ int cpuidle_register_device(struct cpuidle_device *dev) if (!dev) return -EINVAL; - mutex_lock(&cpuidle_lock); + guard(mutex)(&cpuidle_lock); if (dev->registered) - goto out_unlock; + return ret; __cpuidle_device_init(dev); ret = __cpuidle_register_device(dev); if (ret) - goto out_unlock; + return ret; ret = cpuidle_add_sysfs(dev); if (ret) @@ -700,16 +700,14 @@ int cpuidle_register_device(struct cpuidle_device *dev) cpuidle_install_idle_handler(); -out_unlock: - mutex_unlock(&cpuidle_lock); - return ret; out_sysfs: cpuidle_remove_sysfs(dev); out_unregister: __cpuidle_unregister_device(dev); - goto out_unlock; + + return ret; } EXPORT_SYMBOL_GPL(cpuidle_register_device); -- cgit v1.2.3 From 2fd3b83cacfb9160b896fb26117328eeb0598c54 Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Thu, 2 Apr 2026 21:45:32 +0530 Subject: cpupower: remove extern declarations in cmd functions extern char *optarg and extern int optind, opterr, optopt are already declared by , which is included at the top of the file. Repeating extern declarations inside a function body is misleading and unnecessary. Signed-off-by: Kaushlendra Kumar Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 2 -- tools/power/cpupower/utils/cpufreq-set.c | 2 -- tools/power/cpupower/utils/cpuidle-info.c | 2 -- tools/power/cpupower/utils/cpuidle-set.c | 2 -- tools/power/cpupower/utils/cpupower-info.c | 2 -- tools/power/cpupower/utils/cpupower-set.c | 2 -- 6 files changed, 12 deletions(-) diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index 5fe01e516817..5a242b491a9d 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -542,8 +542,6 @@ static struct option info_opts[] = { int cmd_freq_info(int argc, char **argv) { - extern char *optarg; - extern int optind, opterr, optopt; int ret = 0, cont = 1; unsigned int cpu = 0; unsigned int human = 0; diff --git a/tools/power/cpupower/utils/cpufreq-set.c b/tools/power/cpupower/utils/cpufreq-set.c index c5e60a39cfa6..06cd4b280132 100644 --- a/tools/power/cpupower/utils/cpufreq-set.c +++ b/tools/power/cpupower/utils/cpufreq-set.c @@ -195,8 +195,6 @@ static int do_one_cpu(unsigned int cpu, struct cpufreq_policy *new_pol, int cmd_freq_set(int argc, char **argv) { - extern char *optarg; - extern int optind, opterr, optopt; int ret = 0, cont = 1; int double_parm = 0, related = 0, policychange = 0; unsigned long freq = 0; diff --git a/tools/power/cpupower/utils/cpuidle-info.c b/tools/power/cpupower/utils/cpuidle-info.c index 81b4763a97d6..ccb37125bd37 100644 --- a/tools/power/cpupower/utils/cpuidle-info.c +++ b/tools/power/cpupower/utils/cpuidle-info.c @@ -139,8 +139,6 @@ static inline void cpuidle_exit(int fail) int cmd_idle_info(int argc, char **argv) { - extern char *optarg; - extern int optind, opterr, optopt; int ret = 0, cont = 1, output_param = 0, verbose = 1; unsigned int cpu = 0; diff --git a/tools/power/cpupower/utils/cpuidle-set.c b/tools/power/cpupower/utils/cpuidle-set.c index a551d1d4ac51..703094f1343c 100644 --- a/tools/power/cpupower/utils/cpuidle-set.c +++ b/tools/power/cpupower/utils/cpuidle-set.c @@ -24,8 +24,6 @@ static struct option info_opts[] = { int cmd_idle_set(int argc, char **argv) { - extern char *optarg; - extern int optind, opterr, optopt; int ret = 0, cont = 1, param = 0, disabled; unsigned long long latency = 0, state_latency; unsigned int cpu = 0, idlestate = 0, idlestates = 0; diff --git a/tools/power/cpupower/utils/cpupower-info.c b/tools/power/cpupower/utils/cpupower-info.c index 18fd7751f509..79154d71e498 100644 --- a/tools/power/cpupower/utils/cpupower-info.c +++ b/tools/power/cpupower/utils/cpupower-info.c @@ -28,8 +28,6 @@ static void print_wrong_arg_exit(void) int cmd_info(int argc, char **argv) { - extern char *optarg; - extern int optind, opterr, optopt; unsigned int cpu; struct utsname uts; diff --git a/tools/power/cpupower/utils/cpupower-set.c b/tools/power/cpupower/utils/cpupower-set.c index 550a942e72ce..c2176b9fa57d 100644 --- a/tools/power/cpupower/utils/cpupower-set.c +++ b/tools/power/cpupower/utils/cpupower-set.c @@ -33,8 +33,6 @@ static void print_wrong_arg_exit(void) int cmd_set(int argc, char **argv) { - extern char *optarg; - extern int optind, opterr, optopt; unsigned int cpu; struct utsname uts; -- cgit v1.2.3 From 679343977588781bd3effba79e9644aee4ee046c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 7 Apr 2026 14:49:49 -0500 Subject: cpufreq/amd-pstate: Add POWER_SUPPLY select for dynamic EPP The dynamic EPP feature uses power_supply_reg_notifier() and power_supply_unreg_notifier() but doesn't declare a dependency on POWER_SUPPLY, causing linker errors when POWER_SUPPLY is not enabled. Add POWER_SUPPLY to the selects. Suggested-by: K Prateek Nayak Fixes: e30ca6dd5345 ("cpufreq/amd-pstate: Add dynamic energy performance preference") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202604040742.ySEdkuAa-lkp@intel.com/ Signed-off-by: Mario Limonciello Link: https://patch.msgid.link/20260407194949.310114-1-mario.limonciello@amd.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Kconfig.x86 | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index a0dbb9808ae9..027e6ea2e038 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -41,6 +41,7 @@ config X86_AMD_PSTATE select ACPI_CPPC_LIB if X86_64 select CPU_FREQ_GOV_SCHEDUTIL if SMP select ACPI_PLATFORM_PROFILE + select POWER_SUPPLY help This driver adds a CPUFreq driver which utilizes a fine grain processor performance frequency control range instead of legacy -- cgit v1.2.3