From b017500ab53c06441ff7d3a681484e37039b4f57 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 22 Jan 2024 17:11:26 +0100 Subject: PM: sleep: Use bool for all 1-bit fields in struct dev_pm_info For some 1-bit fields in struct dev_pm_info the data type is bool, while for some other 1-bit fields in there it is unsigned int, and these differences are somewhat arbitrary. For consistency, change the data type of the latter to bool, so that all of the 1-bit fields in struct dev_pm_info are bool. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Greg Kroah-Hartman --- include/linux/pm.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index a2f3e53a8196..97b0e23363c8 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -662,8 +662,8 @@ struct pm_subsys_data { struct dev_pm_info { pm_message_t power_state; - unsigned int can_wakeup:1; - unsigned int async_suspend:1; + bool can_wakeup:1; + bool async_suspend:1; bool in_dpm_list:1; /* Owned by the PM core */ bool is_prepared:1; /* Owned by the PM core */ bool is_suspended:1; /* Ditto */ @@ -682,10 +682,10 @@ struct dev_pm_info { bool syscore:1; bool no_pm_callbacks:1; /* Owned by the PM core */ bool async_in_progress:1; /* Owned by the PM core */ - unsigned int must_resume:1; /* Owned by the PM core */ - unsigned int may_skip_resume:1; /* Set by subsystems */ + bool must_resume:1; /* Owned by the PM core */ + bool may_skip_resume:1; /* Set by subsystems */ #else - unsigned int should_wakeup:1; + bool should_wakeup:1; #endif #ifdef CONFIG_PM struct hrtimer suspend_timer; @@ -696,17 +696,17 @@ struct dev_pm_info { atomic_t usage_count; atomic_t child_count; unsigned int disable_depth:3; - unsigned int idle_notification:1; - unsigned int request_pending:1; - unsigned int deferred_resume:1; - unsigned int needs_force_resume:1; - unsigned int runtime_auto:1; + bool idle_notification:1; + bool request_pending:1; + bool deferred_resume:1; + bool needs_force_resume:1; + bool runtime_auto:1; bool ignore_children:1; - unsigned int no_callbacks:1; - unsigned int irq_safe:1; - unsigned int use_autosuspend:1; - unsigned int timer_autosuspends:1; - unsigned int memalloc_noio:1; + bool no_callbacks:1; + bool irq_safe:1; + bool use_autosuspend:1; + bool timer_autosuspends:1; + bool memalloc_noio:1; unsigned int links_count; enum rpm_request request; enum rpm_status runtime_status; -- cgit v1.2.3 From 12753d71e8c5c3e716cedba23ddeed508da0bdc4 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:04:57 +0800 Subject: ACPI: CPPC: Add helper to get the highest performance value Add support for getting the highest performance to the generic CPPC driver. This enables downstream drivers such as amd-pstate to discover and use these values. Refer to Chapter 8.4.6.1.1.1. Highest Performance of ACPI Specification 6.5 for details on continuous performance control of CPPC (linked below). Tested-by: Oleksandr Natalenko Reviewed-by: Mario Limonciello Reviewed-by: Wyes Karny Reviewed-by: Perry Yuan Acked-by: Huang Rui Signed-off-by: Meng Li Link: https://uefi.org/specs/ACPI/6.5/08_Processor_Configuration_and_Control.html?highlight=cppc#highest-performance [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J.
Wysocki --- include/acpi/cppc_acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 3a0995f8bce8..930b6afba6f4 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -139,6 +139,7 @@ struct cppc_cpudata { #ifdef CONFIG_ACPI_CPPC_LIB extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); +extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf); extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_set_enable(int cpu, bool enable); @@ -167,6 +168,10 @@ static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) { return -ENOTSUPP; } +static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) +{ + return -ENOTSUPP; +} static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) { return -ENOTSUPP; -- cgit v1.2.3 From f3a052391822b772b4e27f2594526cf1eb103cab Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:04:58 +0800 Subject: cpufreq: amd-pstate: Enable amd-pstate preferred core support The amd-pstate driver utilizes the functions and data structures provided by the ITMT architecture to enable the scheduler to favor scheduling on cores which can achieve a higher frequency at a lower voltage. We call this amd-pstate preferred core. Here sched_set_itmt_core_prio() is called to set priorities and sched_set_itmt_support() is called to enable the ITMT feature. The amd-pstate driver uses the highest performance value to indicate the priority of a CPU: a higher value means a higher priority. The initial core rankings are set up by amd-pstate when the system boots. Add a hw_prefcore variable to the cpudata structure, which records whether the processor and power firmware support the preferred core feature. Add a new early parameter `disable` to allow users to disable preferred core support. The amd-pstate driver supports the preferred core feature only when the hardware supports it and the user sets `enabled` via the early parameter. Tested-by: Oleksandr Natalenko Reviewed-by: Huang Rui Reviewed-by: Wyes Karny Reviewed-by: Mario Limonciello Co-developed-by: Perry Yuan Signed-off-by: Perry Yuan Signed-off-by: Meng Li Signed-off-by: Rafael J. Wysocki --- include/linux/amd-pstate.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 6ad02ad9c7b4..68fc1bd8d851 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -52,6 +52,9 @@ struct amd_aperf_mperf { * @prev: Last Aperf/Mperf/tsc count value read from register * @freq: current cpu frequency value * @boost_supported: check whether the Processor or SBIOS supports boost mode + * @hw_prefcore: check whether HW supports preferred core feature. + * Only when hw_prefcore and early prefcore param are true, + * AMD P-State driver supports preferred core feature.
* @epp_policy: Last saved policy used to set energy-performance preference * @epp_cached: Cached CPPC energy-performance preference value * @policy: Cpufreq policy value @@ -85,6 +88,7 @@ struct amd_cpudata { u64 freq; bool boost_supported; + bool hw_prefcore; /* EPP feature related attributes*/ s16 epp_policy; -- cgit v1.2.3 From 9c4a13a08a9b7afa4bc33f57675358f0195e302c Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:04:59 +0800 Subject: ACPI: cpufreq: Add highest perf change notification Platform firmware sends notify 0x85 to inform the OS that the highest performance of a CPU has changed. This will be used by the AMD P-state driver to update the ranking of preferred cores and set the priority of cores accordingly. Tested-by: Oleksandr Natalenko Reviewed-by: Mario Limonciello Reviewed-by: Huang Rui Reviewed-by: Perry Yuan Signed-off-by: Meng Li Link: https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-device-notification-values [ rjw: New subject, changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index afda5f24d3dd..9bebeec24abb 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -263,6 +263,7 @@ static inline bool cpufreq_supports_freq_invariance(void) return false; } static inline void disable_cpufreq(void) { } +static inline void cpufreq_update_limits(unsigned int cpu) { } #endif #ifdef CONFIG_CPU_FREQ_STAT -- cgit v1.2.3 From e571a5e2068ef57945fcd5d0fb950f8f96da6dc8 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:05:00 +0800 Subject: cpufreq: amd-pstate: Update amd-pstate preferred core ranking dynamically Preferred core rankings can be changed dynamically by the platform based on the workload and platform conditions and accounting for thermals and aging. When this occurs, the CPU priorities need to be set accordingly. Tested-by: Oleksandr Natalenko Reviewed-by: Mario Limonciello Reviewed-by: Wyes Karny Reviewed-by: Huang Rui Reviewed-by: Perry Yuan Signed-off-by: Meng Li Signed-off-by: Rafael J. Wysocki --- include/linux/amd-pstate.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 68fc1bd8d851..d21838835abd 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -39,11 +39,16 @@ struct amd_aperf_mperf { * @cppc_req_cached: cached performance request hints * @highest_perf: the maximum performance an individual processor may reach, * assuming ideal conditions + * For platforms that do not support the preferred core feature, the + * highest_perf may be configured with 166 or 255, to avoid the max frequency + * being calculated wrongly. We take the fixed value as the highest_perf. * @nominal_perf: the maximum sustained performance level of the processor, * assuming ideal operating conditions * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power * savings are achieved * @lowest_perf: the absolute lowest performance level of the processor + * @prefcore_ranking: the preferred core ranking, a higher value indicates a higher + * priority.
* @max_freq: the frequency that mapped to highest_perf * @min_freq: the frequency that mapped to lowest_perf * @nominal_freq: the frequency that mapped to nominal_perf @@ -73,6 +78,7 @@ struct amd_cpudata { u32 nominal_perf; u32 lowest_nonlinear_perf; u32 lowest_perf; + u32 prefcore_ranking; u32 min_limit_perf; u32 max_limit_perf; u32 min_limit_freq; -- cgit v1.2.3 From bc88528cda2eddc3e5ea304fc3f147f1b4186aa4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:09:44 +0100 Subject: PM: sleep: stats: Use array of suspend step names Replace suspend_step_name() in the suspend statistics code with an array of suspend step names which has fewer lines of code and less overhead. While at it, remove two unnecessary line breaks in suspend_stats_show() and adjust some white space in there to the kernel coding style for a more consistent code layout. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- include/linux/suspend.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index ef503088942d..58f7352af205 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -41,7 +41,8 @@ typedef int __bitwise suspend_state_t; #define PM_SUSPEND_MAX ((__force suspend_state_t) 4) enum suspend_stat_step { - SUSPEND_FREEZE = 1, + SUSPEND_WORKING = 0, + SUSPEND_FREEZE, SUSPEND_PREPARE, SUSPEND_SUSPEND, SUSPEND_SUSPEND_LATE, -- cgit v1.2.3 From b730bab0b9c4204d7dda3f5bc8adf4292497fc39 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:11:57 +0100 Subject: PM: sleep: stats: Use an array of step failure counters Instead of using a set of individual struct suspend_stats fields representing suspend step failure counters, use an array of counters indexed by enum suspend_stat_step for this purpose, which allows dpm_save_failed_step() to increment the appropriate counter automatically, so that its callers don't need to do that directly. It also allows suspend_stats_show() to carry out a loop over the counters array to print their values. Because the counters cannot become negative, use unsigned int for representing them. The only user-observable impact of this change is a different ordering of entries in the suspend_stats debugfs file which is not expected to matter. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- include/linux/suspend.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 58f7352af205..5e4c4d4aed95 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -52,17 +52,12 @@ enum suspend_stat_step { SUSPEND_RESUME }; +#define SUSPEND_NR_STEPS SUSPEND_RESUME + struct suspend_stats { + unsigned int step_failures[SUSPEND_NR_STEPS]; int success; int fail; - int failed_freeze; - int failed_prepare; - int failed_suspend; - int failed_suspend_late; - int failed_suspend_noirq; - int failed_resume; - int failed_resume_early; - int failed_resume_noirq; #define REC_FAILED_NUM 2 int last_failed_dev; char failed_devs[REC_FAILED_NUM][40]; @@ -95,6 +90,7 @@ static inline void dpm_save_failed_errno(int err) static inline void dpm_save_failed_step(enum suspend_stat_step step) { + suspend_stats.step_failures[step-1]++; suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; suspend_stats.last_failed_step++; suspend_stats.last_failed_step %= REC_FAILED_NUM; -- cgit v1.2.3 From 2231f78d3e15e45abe534db1997bc6a2153dc01c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:13:14 +0100 Subject: PM: sleep: stats: Use unsigned int for success and failure counters Change the type of the "success" and "fail" fields in struct suspend_stats to unsigned int, because they cannot be negative. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- include/linux/suspend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 5e4c4d4aed95..216bae989535 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -56,8 +56,8 @@ enum suspend_stat_step { struct suspend_stats { unsigned int step_failures[SUSPEND_NR_STEPS]; - int success; - int fail; + unsigned int success; + unsigned int fail; #define REC_FAILED_NUM 2 int last_failed_dev; char failed_devs[REC_FAILED_NUM][40]; -- cgit v1.2.3 From 9ff544fa5f94fe07f99a36d2138075b322067546 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:30:44 +0100 Subject: PM: sleep: stats: Define suspend_stats next to the code using it It is not necessary to define struct suspend_stats in a header file and the suspend_stats variable in the core device system-wide PM code. They both can be defined in kernel/power/main.c, next to the sysfs and debugfs code accessing suspend_stats, which can be static. Modify the code in question in accordance with the above observation and replace the static inline functions manipulating suspend_stats with regular ones defined in kernel/power/main.c. While at it, move the enum suspend_stat_step to the end of suspend.h which is a more suitable place for it. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson --- include/linux/suspend.h | 71 +++++++++++-------------------------------------- 1 file changed, 15 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 216bae989535..da6ebca3ff77 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -40,62 +40,6 @@ typedef int __bitwise suspend_state_t; #define PM_SUSPEND_MIN PM_SUSPEND_TO_IDLE #define PM_SUSPEND_MAX ((__force suspend_state_t) 4) -enum suspend_stat_step { - SUSPEND_WORKING = 0, - SUSPEND_FREEZE, - SUSPEND_PREPARE, - SUSPEND_SUSPEND, - SUSPEND_SUSPEND_LATE, - SUSPEND_SUSPEND_NOIRQ, - SUSPEND_RESUME_NOIRQ, - SUSPEND_RESUME_EARLY, - SUSPEND_RESUME }; - -#define SUSPEND_NR_STEPS SUSPEND_RESUME - -struct suspend_stats { - unsigned int step_failures[SUSPEND_NR_STEPS]; - unsigned int success; - unsigned int fail; -#define REC_FAILED_NUM 2 - int last_failed_dev; - char failed_devs[REC_FAILED_NUM][40]; - int last_failed_errno; - int errno[REC_FAILED_NUM]; - int last_failed_step; - u64 last_hw_sleep; - u64 total_hw_sleep; - u64 max_hw_sleep; - enum suspend_stat_step failed_steps[REC_FAILED_NUM]; }; - -extern struct suspend_stats suspend_stats; - -static inline void dpm_save_failed_dev(const char *name) { - strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], - name, - sizeof(suspend_stats.failed_devs[0])); - suspend_stats.last_failed_dev++; - suspend_stats.last_failed_dev %= REC_FAILED_NUM; } - -static inline void dpm_save_failed_errno(int err) { - suspend_stats.errno[suspend_stats.last_failed_errno] = err; - suspend_stats.last_failed_errno++; - suspend_stats.last_failed_errno %= REC_FAILED_NUM; } - -static inline void dpm_save_failed_step(enum suspend_stat_step step) { - suspend_stats.step_failures[step-1]++; - suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; - suspend_stats.last_failed_step++; - suspend_stats.last_failed_step %= REC_FAILED_NUM; } - /** * struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. @@ -623,4 +567,19 @@ static inline void queue_up_suspend_work(void) {} #endif /* !CONFIG_PM_AUTOSLEEP */ +enum suspend_stat_step { + SUSPEND_WORKING = 0, + SUSPEND_FREEZE, + SUSPEND_PREPARE, + SUSPEND_SUSPEND, + SUSPEND_SUSPEND_LATE, + SUSPEND_SUSPEND_NOIRQ, + SUSPEND_RESUME_NOIRQ, + SUSPEND_RESUME_EARLY, + SUSPEND_RESUME }; + +void dpm_save_failed_dev(const char *name); +void dpm_save_failed_step(enum suspend_stat_step step); + #endif /* _LINUX_SUSPEND_H */ -- cgit v1.2.3 From a3c78778f50c4db6cc0bb6aa2986c0174b1267d0 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:38 +0000 Subject: PM: EM: Refactor em_pd_get_efficient_state() to be more flexible The Energy Model (EM) is going to support runtime modification. There are going to be two EM tables which store the information. This patch aims to prepare the code to be generic and use one of the tables. The function will no longer get a pointer to 'struct em_perf_domain' (the EM) but instead a pointer to 'struct em_perf_state' (which is one of the EM's tables). Prepare em_pd_get_efficient_state() for the upcoming changes and make it reusable. Return an index for the best performance state for a given EM table. The newly introduced function arguments allow it to work on different performance state arrays. The caller of em_pd_get_efficient_state() should be able to use the index either on the default or the modifiable EM table.
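[ Editor's illustration, not part of the patch: a minimal sketch of the new calling convention, assuming only the declarations visible in the diff below; the helper name is hypothetical. ]

	/* Map a frequency to a performance state index on a chosen EM table. */
	static unsigned long em_resolve_freq(struct em_perf_domain *pd,
					     unsigned long freq)
	{
		struct em_perf_state *table = pd->table;	/* default table */
		int i;

		i = em_pd_get_efficient_state(table, pd->nr_perf_states, freq,
					      pd->flags);

		/* The returned index is valid for whichever table was passed in. */
		return table[i].frequency;
	}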
Reviewed-by: Daniel Lezcano Reviewed-by: Hongyan Xia Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 88d91e087471..1dcd1645dde7 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -175,33 +175,35 @@ void em_dev_unregister_perf_domain(struct device *dev); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM - * @pd : Performance domain for which we want an efficient frequency - * @freq : Frequency to map with the EM + * @table: List of performance states, in ascending order + * @nr_perf_states: Number of performance states + * @freq: Frequency to map with the EM + * @pd_flags: Performance Domain flags * * It is called from the scheduler code quite frequently and as a consequence * doesn't implement any check. * - * Return: An efficient performance state, high enough to meet @freq + * Return: An efficient performance state id, high enough to meet @freq * requirement. */ -static inline struct em_perf_state *em_pd_get_efficient_state(struct em_perf_domain *pd, - unsigned long freq) +static inline int em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states, + unsigned long freq, unsigned long pd_flags) { struct em_perf_state *ps; int i; - for (i = 0; i < pd->nr_perf_states; i++) { - ps = &pd->table[i]; + for (i = 0; i < nr_perf_states; i++) { + ps = &table[i]; if (ps->frequency >= freq) { - if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && + if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && ps->flags & EM_PERF_STATE_INEFFICIENT) continue; - break; + return i; } } - return ps; + return nr_perf_states - 1; } /** @@ -226,7 +228,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, { unsigned long freq, ref_freq, scale_cpu; struct em_perf_state *ps; - int cpu; + int cpu, i; if (!sum_util) return 0; @@ -250,7 +252,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested frequency. */ - ps = em_pd_get_efficient_state(pd, freq); + i = em_pd_get_efficient_state(pd->table, pd->nr_perf_states, freq, + pd->flags); + ps = &pd->table[i]; /* * The capacity of a CPU in the domain at the performance state (ps) -- cgit v1.2.3 From ca0fc871f16f4bef746b5ba814b67afb59119700 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:42 +0000 Subject: PM: EM: Introduce runtime modifiable table The new runtime table can be populated with new power data to better reflect the actual efficiency of the device, e.g. a CPU. The power can vary over time, e.g. due to SoC temperature changes. Higher temperature can increase power values. For longer-running scenarios, such as gaming or camera use, when other devices (e.g. GPU, ISP) are used as well, the CPU power can change. The new EM framework is able to address this issue and change the EM data safely at runtime. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J.
Wysocki --- include/linux/energy_model.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 1dcd1645dde7..8ddf1d8a9581 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -36,9 +36,20 @@ struct em_perf_state { */ #define EM_PERF_STATE_INEFFICIENT BIT(0) +/** + * struct em_perf_table - Performance states table + * @rcu: RCU used for safe access and destruction + * @state: List of performance states, in ascending order + */ +struct em_perf_table { + struct rcu_head rcu; + struct em_perf_state state[]; +}; + /** * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order + * @em_table: Pointer to the runtime modifiable em_perf_table * @nr_perf_states: Number of performance states * @flags: See "em_perf_domain flags" * @cpus: Cpumask covering the CPUs of the domain. It's here @@ -54,6 +65,7 @@ struct em_perf_state { */ struct em_perf_domain { struct em_perf_state *table; + struct em_perf_table __rcu *em_table; int nr_perf_states; unsigned long flags; unsigned long cpus[]; -- cgit v1.2.3 From aa11a7ebfd5d698f541641922beede1cb474bf70 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:43 +0000 Subject: PM: EM: Use runtime modified EM for CPUs energy estimation in EAS The new Energy Model (EM) supports runtime modification of the performance state table to better model the power used by the SoC. Use this new feature to improve energy estimation and therefore task placement in Energy Aware Scheduler (EAS). Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 8ddf1d8a9581..5f842da3bb0c 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -239,9 +239,14 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long allowed_cpu_cap) { unsigned long freq, ref_freq, scale_cpu; + struct em_perf_table *em_table; struct em_perf_state *ps; int cpu, i; +#ifdef CONFIG_SCHED_DEBUG + WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); +#endif + if (!sum_util) return 0; @@ -264,9 +269,10 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested frequency. */ - i = em_pd_get_efficient_state(pd->table, pd->nr_perf_states, freq, - pd->flags); - ps = &pd->table[i]; + em_table = rcu_dereference(pd->em_table); + i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states, + freq, pd->flags); + ps = &em_table->state[i]; /* * The capacity of a CPU in the domain at the performance state (ps) -- cgit v1.2.3 From ffcf9bce7af02a21fb73738999de1e3d4fde5aca Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:44 +0000 Subject: PM: EM: Add functions for memory allocations for new EM tables The runtime modified EM table can be provided from drivers. Create mechanism which allows safely allocate and free the table for device drivers. The same table can be used by the EAS in task scheduler code paths, so make sure the memory is not freed when the device driver module is unloaded. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- include/linux/energy_model.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 5f842da3bb0c..27911dc1887e 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -39,10 +40,12 @@ struct em_perf_state { /** * struct em_perf_table - Performance states table * @rcu: RCU used for safe access and destruction + * @kref: Reference counter to track the users * @state: List of performance states, in ascending order */ struct em_perf_table { struct rcu_head rcu; + struct kref kref; struct em_perf_state state[]; }; @@ -184,6 +187,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *span, bool microwatts); void em_dev_unregister_perf_domain(struct device *dev); +struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); +void em_table_free(struct em_perf_table __rcu *table); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -365,6 +370,12 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) { return 0; } +static inline +struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +{ + return NULL; +} +static inline void em_table_free(struct em_perf_table __rcu *table) {} #endif #endif -- cgit v1.2.3 From 977230d5d50314f9920d3ee6348773d8babbfb58 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:45 +0000 Subject: PM: EM: Introduce em_dev_update_perf_domain() for EM updates Add API function em_dev_update_perf_domain() which allows the EM to be changed safely. Concurrent updaters are serialized with a mutex and the removal of memory that will not be used any more is carried out with the help of RCU. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 27911dc1887e..324a3a8e0a2d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -183,6 +183,8 @@ struct em_data_callback { struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); +int em_dev_update_perf_domain(struct device *dev, + struct em_perf_table __rcu *new_table); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *span, bool microwatts); @@ -376,6 +378,12 @@ struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) return NULL; } static inline void em_table_free(struct em_perf_table __rcu *table) {} +static inline +int em_dev_update_perf_domain(struct device *dev, + struct em_perf_table __rcu *new_table) +{ + return -EINVAL; +} #endif #endif -- cgit v1.2.3 From ee1a19873ce1234a3c2e6f84af3624fc73bfbd9c Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:46 +0000 Subject: PM: EM: Add em_perf_state_from_pd() to get performance states table Introduce a wrapper to get the performance states table of the performance domain. The function should be called within the RCU read critical section. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- include/linux/energy_model.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 324a3a8e0a2d..158dad6ea313 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -338,6 +338,23 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) return pd->nr_perf_states; } +/** + * em_perf_state_from_pd() - Get the performance states table of perf. + * domain + * @pd : performance domain for which this must be done + * + * To use this function the rcu_read_lock() should be held. After the usage + * of the performance states table is finished, the rcu_read_unlock() should + * be called. + * + * Return: the pointer to the performance states table of the performance domain + */ +static inline struct em_perf_state *em_perf_state_from_pd(struct em_perf_domain *pd) +{ + return rcu_dereference(pd->em_table)->state; +} + #else struct em_data_callback {}; #define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { } @@ -384,6 +401,11 @@ int em_dev_update_perf_domain(struct device *dev, { return -EINVAL; } +static inline struct em_perf_state *em_perf_state_from_pd(struct em_perf_domain *pd) +{ + return NULL; +} #endif #endif -- cgit v1.2.3 From 5a367f7b7014af86bd1ac0865a42db55187dbd3c Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:47 +0000 Subject: PM: EM: Add performance field to struct em_perf_state and optimize The performance doesn't scale linearly with the frequency. Also, it may differ between workloads. Some CPUs are designed to be particularly good at some applications, e.g. image or video processing, while other CPUs are better at different ones. When those different types of CPUs are combined in one SoC, they should be properly modeled so that the Energy Aware Scheduler (EAS) can get the most out of the hardware. The Energy Model (EM) provides the power vs. performance curves to the EAS, but assumes the CPU capacity is fixed and scales linearly with the frequency. This patch allows the curve to be adjusted on the 'performance' axis as well. Code speed optimization: removing map_util_freq() avoids one division and one multiplication operation in the EAS hot code path. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 158dad6ea313..ce24ea3fe41c 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -13,6 +13,7 @@ /** * struct em_perf_state - Performance state of a performance domain + * @performance: CPU performance (capacity) at a given frequency * @frequency: The frequency in KHz, for consistency with CPUFreq * @power: The power consumed at this level (by 1 CPU or by a registered * device). It can be a total power: static and dynamic. * @cost: The cost coefficient associated with this level, used during * energy calculation. Equal to: power * max_frequency / frequency * @flags: see "em_perf_state flags" description below.
*/ struct em_perf_state { + unsigned long performance; unsigned long frequency; unsigned long power; unsigned long cost; @@ -196,25 +198,25 @@ void em_table_free(struct em_perf_table __rcu *table); * em_pd_get_efficient_state() - Get an efficient performance state from the EM * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states - * @freq: Frequency to map with the EM + * @max_util: Max utilization to map with the EM * @pd_flags: Performance Domain flags * * It is called from the scheduler code quite frequently and as a consequence * doesn't implement any check. * - * Return: An efficient performance state id, high enough to meet @freq + * Return: An efficient performance state id, high enough to meet @max_util * requirement. */ static inline int em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states, - unsigned long freq, unsigned long pd_flags) + unsigned long max_util, unsigned long pd_flags) { struct em_perf_state *ps; int i; for (i = 0; i < nr_perf_states; i++) { ps = &table[i]; - if (ps->frequency >= freq) { + if (ps->performance >= max_util) { if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && ps->flags & EM_PERF_STATE_INEFFICIENT) continue; @@ -245,9 +247,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util, unsigned long allowed_cpu_cap) { - unsigned long freq, ref_freq, scale_cpu; struct em_perf_table *em_table; struct em_perf_state *ps; + unsigned long scale_cpu; int cpu, i; #ifdef CONFIG_SCHED_DEBUG @@ -260,25 +262,23 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, /* * In order to predict the performance state, map the utilization of * the most utilized CPU of the performance domain to a requested - * frequency, like schedutil. Take also into account that the real - * frequency might be set lower (due to thermal capping). Thus, clamp + * performance, like schedutil. Take also into account that the real + * performance might be set lower (due to thermal capping). Thus, clamp * max utilization to the allowed CPU capacity before calculating - * effective frequency. + * effective performance. */ cpu = cpumask_first(to_cpumask(pd->cpus)); scale_cpu = arch_scale_cpu_capacity(cpu); - ref_freq = arch_scale_freq_ref(cpu); max_util = min(max_util, allowed_cpu_cap); - freq = map_util_freq(max_util, ref_freq, scale_cpu); /* * Find the lowest performance state of the Energy Model above the - * requested frequency. + * requested performance. */ em_table = rcu_dereference(pd->em_table); i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states, - freq, pd->flags); + max_util, pd->flags); ps = &em_table->state[i]; /* -- cgit v1.2.3 From 1b600da510735a0f92c8b4140a7e2cb037a6a6c3 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:49 +0000 Subject: PM: EM: Optimize em_cpu_energy() and remove division The Energy Model (EM) can be modified at runtime which brings new possibilities. The em_cpu_energy() is called by the Energy Aware Scheduler (EAS) in its hot path. The energy calculation uses power value for a given performance state (ps) and the CPU busy time as percentage for that given frequency. It is possible to avoid the division by 'scale_cpu' at runtime, because EM is updated whenever new max capacity CPU is set in the system. Use that feature and do the needed division during the calculation of the coefficient 'ps->cost'. 
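[ Editor's illustration, not from the patch: a hedged sketch of the idea with a hypothetical helper. The division by 'scale_cpu' moves into the one-time computation of the coefficient (div64_u64() from linux/math64.h), so only a multiplication remains in the hot path. ]

	/* Done once, when the EM table is (re)built; cf. equation (3) below. */
	static void em_set_cost(struct em_perf_state *ps, u64 cpu_max_freq,
				u64 scale_cpu)
	{
		ps->cost = div64_u64((u64)ps->power * cpu_max_freq,
				     (u64)ps->frequency * scale_cpu);
	}

	/* Hot path, cf. equation (4) below: pd_nrg = ps->cost * sum_util; */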
That enhanced 'ps->cost' value can then simply be multiplied by the utilization: pd_nrg = ps->cost * \Sum cpu_util to get the needed energy for the whole Performance Domain (PD). With this optimization and the earlier removal of map_util_freq(), em_cpu_energy() should run faster on the Big CPU by 1.43x and on the Little CPU by 1.69x (RockPi 4B board). Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 55 ++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index ce24ea3fe41c..aabfc26fcd31 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -115,27 +115,6 @@ struct em_perf_domain { #define EM_MAX_NUM_CPUS 16 #endif -/* - * To avoid an overflow on 32bit machines while calculating the energy - * use a different order in the operation. First divide by the 'cpu_scale' - * which would reduce big value stored in the 'cost' field, then multiply by - * the 'sum_util'. This would allow to handle existing platforms, which have - * e.g. power ~1.3 Watt at max freq, so the 'cost' value > 1mln micro-Watts. - * In such scenario, where there are 4 CPUs in the Perf. Domain the 'sum_util' - * could be 4096, then multiplication: 'cost' * 'sum_util' would overflow. - * This reordering of operations has some limitations, we lose small - * precision in the estimation (comparing to 64bit platform w/o reordering). - * - * We are safe on 64bit machine. - */ -#ifdef CONFIG_64BIT -#define em_estimate_energy(cost, sum_util, scale_cpu) \ - (((cost) * (sum_util)) / (scale_cpu)) -#else -#define em_estimate_energy(cost, sum_util, scale_cpu) \ - (((cost) / (scale_cpu)) * (sum_util)) -#endif - struct em_data_callback { /** * active_power() - Provide power at the next performance state of @@ -249,8 +228,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, { struct em_perf_table *em_table; struct em_perf_state *ps; - unsigned long scale_cpu; - int cpu, i; + int i; #ifdef CONFIG_SCHED_DEBUG WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); @@ -267,9 +245,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * max utilization to the allowed CPU capacity before calculating * effective performance. */ - cpu = cpumask_first(to_cpumask(pd->cpus)); - scale_cpu = arch_scale_cpu_capacity(cpu); - + max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); /* @@ -282,12 +258,12 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, ps = &em_table->state[i]; /* - * The capacity of a CPU in the domain at the performance state (ps) - * can be computed as: + * The performance (capacity) of a CPU in the domain at the performance + * state (ps) can be computed as: * - * ps->freq * scale_cpu - * ps->cap = -------------------- (1) - * cpu_max_freq + * ps->freq * scale_cpu + * ps->performance = -------------------- (1) + * cpu_max_freq * * So, ignoring the costs of idle states (which are not available in * the EM), the energy consumed by this CPU at that performance state * would be: * * ps->power * cpu_util * cpu_nrg = -------------------- (2) - * ps->cap + * ps->performance * - * since 'cpu_util / ps->cap' represents its percentage of busy time.
+ * since 'cpu_util / ps->performance' represents its percentage of busy + * time. * * NOTE: Although the result of this computation actually is in * units of power, it can be manipulated as an energy value @@ -307,9 +284,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product * of two terms: * - * ps->power * cpu_max_freq cpu_util - * cpu_nrg = ------------------------ * --------- (3) - * ps->freq scale_cpu + * ps->power * cpu_max_freq + * cpu_nrg = ------------------------ * cpu_util (3) + * ps->freq * scale_cpu * * The first term is static, and is stored in the em_perf_state struct * as 'ps->cost'. @@ -319,11 +296,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * total energy of the domain (which is the simple sum of the energy of * all of its CPUs) can be factorized as: * - * ps->cost * \Sum cpu_util - * pd_nrg = ------------------------ (4) - * scale_cpu + * pd_nrg = ps->cost * \Sum cpu_util (4) */ - return em_estimate_energy(ps->cost, sum_util, scale_cpu); + return ps->cost * sum_util; } /** -- cgit v1.2.3 From 24e9fb635df2790eccb0e95ff65c6dee7a97fcb7 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:55 +0000 Subject: PM: EM: Remove old table Remove the old EM table which wasn't able to modify the data. Clean the unneeded function and refactor the code a bit. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index aabfc26fcd31..92866a81abe4 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -53,7 +53,6 @@ struct em_perf_table { /** * struct em_perf_domain - Performance domain - * @table: List of performance states, in ascending order * @em_table: Pointer to the runtime modifiable em_perf_table * @nr_perf_states: Number of performance states * @flags: See "em_perf_domain flags" @@ -69,7 +68,6 @@ struct em_perf_table { * field is unused. */ struct em_perf_domain { - struct em_perf_state *table; struct em_perf_table __rcu *em_table; int nr_perf_states; unsigned long flags; -- cgit v1.2.3 From 22ea02848c07d1cbd15a5f442138ca429866300d Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:56 +0000 Subject: PM: EM: Add em_dev_compute_costs() The device drivers can modify EM at runtime by providing a new EM table. The EM is used by the EAS and the em_perf_state::cost stores pre-calculated value to avoid overhead. This patch provides the API for device drivers to calculate the cost values properly (and not duplicate the same code). Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- include/linux/energy_model.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 92866a81abe4..770755df852f 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -170,6 +170,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, void em_dev_unregister_perf_domain(struct device *dev); struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table __rcu *table); +int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, + int nr_states); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -379,6 +381,12 @@ struct em_perf_state *em_perf_state_from_pd(struct em_perf_domain *pd) { return NULL; } +static inline +int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, + int nr_states) +{ + return -EINVAL; +} #endif #endif -- cgit v1.2.3 From c0ef3df8dbaef51ee4cfd58a471adf2eaee6f6b3 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 30 Jan 2024 13:28:05 +0200 Subject: PM: runtime: Simplify pm_runtime_get_if_active() usage There are two ways to opportunistically increment a device's runtime PM usage count, calling either pm_runtime_get_if_active() or pm_runtime_get_if_in_use(). The former has an argument to tell whether to ignore the usage count or not, and the latter simply calls the former with ign_usage_count set to false. The other users that want to ignore the usage_count will have to explicitly set that argument to true which is a bit cumbersome. To make this function more practical to use, remove the ign_usage_count argument from the function. The main implementation is in a static function called pm_runtime_get_conditional() and implementations of pm_runtime_get_if_active() and pm_runtime_get_if_in_use() are moved to runtime.c. Signed-off-by: Sakari Ailus Reviewed-by: Alex Elder Reviewed-by: Laurent Pinchart Acked-by: Takashi Iwai # sound/ Reviewed-by: Jacek Lawrynowicz # drivers/accel/ivpu/ Acked-by: Rodrigo Vivi # drivers/gpu/drm/i915/ Reviewed-by: Rodrigo Vivi Acked-by: Bjorn Helgaas # drivers/pci/ Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 7c9b35448563..436baa167498 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -72,7 +72,8 @@ extern int pm_runtime_force_resume(struct device *dev); extern int __pm_runtime_idle(struct device *dev, int rpmflags); extern int __pm_runtime_suspend(struct device *dev, int rpmflags); extern int __pm_runtime_resume(struct device *dev, int rpmflags); -extern int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count); +extern int pm_runtime_get_if_active(struct device *dev); +extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); @@ -94,18 +95,6 @@ extern void pm_runtime_release_supplier(struct device_link *link); extern int devm_pm_runtime_enable(struct device *dev); -/** - * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. - * @dev: Target device. 
- * - * Increment the runtime PM usage counter of @dev if its runtime PM status is - * %RPM_ACTIVE and its runtime PM usage counter is greater than 0. - */ -static inline int pm_runtime_get_if_in_use(struct device *dev) -{ - return pm_runtime_get_if_active(dev, false); -} - /** * pm_suspend_ignore_children - Set runtime PM behavior regarding children. * @dev: Target device. @@ -275,8 +264,7 @@ static inline int pm_runtime_get_if_in_use(struct device *dev) { return -EINVAL; } -static inline int pm_runtime_get_if_active(struct device *dev, - bool ign_usage_count) +static inline int pm_runtime_get_if_active(struct device *dev) { return -EINVAL; } -- cgit v1.2.3 From b7d46644e554ed017dfabc0841acf418d0584bc9 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 30 Jan 2024 13:28:32 +0200 Subject: PM: runtime: Add pm_runtime_put_autosuspend() replacement Add __pm_runtime_put_autosuspend() that replaces pm_runtime_put_autosuspend() for new users. The intent is to later re-purpose pm_runtime_put_autosuspend() to also mark the device's last busy stamp---which is what the vast majority of users actually need. This is also described in pm_runtime_put_autosuspend() documentation. Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 436baa167498..d39dc863f612 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -448,6 +448,18 @@ static inline int pm_runtime_put(struct device *dev) return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } +/** + * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. + * @dev: Target device. + * + * Decrement the runtime PM usage counter of @dev and if it turns out to be + * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). + */ +static inline int __pm_runtime_put_autosuspend(struct device *dev) +{ + return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); +} + /** * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. -- cgit v1.2.3 From 1aa09b9379a7a644cd2f75ae0bac82b8783df600 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 31 Jan 2024 19:37:09 +0800 Subject: powercap: intel_rapl: Fix locking in TPMI RAPL The RAPL framework uses CPU hotplug locking to protect the rapl_packages list and rp->lead_cpu to guarantee that 1. the RAPL package device is not unprobed and freed 2. the cached rp->lead_cpu is always valid for operations like powercap sysfs accesses. Current RAPL APIs assume being called from CPU hotplug callbacks which hold the CPU hotplug lock, but TPMI RAPL driver invokes the APIs in the driver's .probe() function without acquiring the CPU hotplug lock. Fix the problem by providing both locked and lockless versions of RAPL APIs. Fixes: 9eef7f9da928 ("powercap: intel_rapl: Introduce RAPL TPMI interface driver") Signed-off-by: Zhang Rui Cc: 6.5+ # 6.5+ Signed-off-by: Rafael J. 
Wysocki --- include/linux/intel_rapl.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 33f21bd85dbf..f3196f82fd8a 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -178,6 +178,12 @@ struct rapl_package { struct rapl_if_priv *priv; }; +struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv, + bool id_is_cpu); +struct rapl_package *rapl_add_package_cpuslocked(int id, struct rapl_if_priv *priv, + bool id_is_cpu); +void rapl_remove_package_cpuslocked(struct rapl_package *rp); + struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu); struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); -- cgit v1.2.3 From 015abee404760249a5c968b9ce29216b94b8ced1 Mon Sep 17 00:00:00 2001 From: Vilas Bhat Date: Wed, 21 Feb 2024 19:47:53 +0000 Subject: PM: runtime: add tracepoint for runtime_status changes Existing runtime PM ftrace events (`rpm_suspend`, `rpm_resume`, `rpm_return_int`) offer limited visibility into the exact timing of device runtime power state transitions, particularly when asynchronous operations are involved. When the `rpm_suspend` or `rpm_resume` functions are invoked with the `RPM_ASYNC` flag, a return value of 0 i.e., success merely indicates that the device power state request has been queued, not that the device has yet transitioned. A new ftrace event, `rpm_status`, is introduced. This event directly logs the `power.runtime_status` value of a device whenever it changes providing granular tracking of runtime power state transitions regardless of synchronous or asynchronous `rpm_suspend` / `rpm_resume` usage. Signed-off-by: Vilas Bhat Signed-off-by: Rafael J. Wysocki --- include/trace/events/rpm.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include') diff --git a/include/trace/events/rpm.h b/include/trace/events/rpm.h index 3c716214dab1..bd120e23ce12 100644 --- a/include/trace/events/rpm.h +++ b/include/trace/events/rpm.h @@ -101,6 +101,48 @@ TRACE_EVENT(rpm_return_int, __entry->ret) ); +#define RPM_STATUS_STRINGS \ + EM(RPM_INVALID, "RPM_INVALID") \ + EM(RPM_ACTIVE, "RPM_ACTIVE") \ + EM(RPM_RESUMING, "RPM_RESUMING") \ + EM(RPM_SUSPENDED, "RPM_SUSPENDED") \ + EMe(RPM_SUSPENDING, "RPM_SUSPENDING") + +/* Enums require being exported to userspace, for user tool parsing. */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +RPM_STATUS_STRINGS + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings that + * will be printed in the output. 
*/ #undef EM #undef EMe #define EM(a, b) { a, b }, #define EMe(a, b) { a, b } TRACE_EVENT(rpm_status, TP_PROTO(struct device *dev, enum rpm_status status), TP_ARGS(dev, status), TP_STRUCT__entry( __string(name, dev_name(dev)) __field(int, status) ), TP_fast_assign( __assign_str(name, dev_name(dev)); __entry->status = status; ), TP_printk("%s status=%s", __get_str(name), __print_symbolic(__entry->status, RPM_STATUS_STRINGS)) ); #endif /* _TRACE_RUNTIME_POWER_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 88debc69754f7fe5186954941bb1cc4d744f4f25 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 22 Feb 2024 16:34:15 +0100 Subject: cpufreq: Remove references to 10ms min sampling rate A minimum sampling rate value of 10ms was introduced in: commit cef9615a853e ("[CPUFREQ] ondemand: Uncouple minimal sampling rate from HZ in NO_HZ case") The use of this value was removed in: commit ed4676e25463 ("cpufreq: Replace "max_transition_latency" with "dynamic_switching"") Remove: - a comment referencing this value - an unused macro associated with this value Signed-off-by: Pierre Gondois Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9bebeec24abb..85908b3a2f24 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -569,9 +569,7 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, /* * The polling frequency depends on the capability of the processor. Default - * polling frequency is 1000 times the transition latency of the processor. The - * ondemand governor will work on any processor with transition latency <= 10ms, - * using appropriate sampling rate. + * polling frequency is 1000 times the transition latency of the processor. */ #define LATENCY_MULTIPLIER (1000) -- cgit v1.2.3 From d394abcb12bb1a6f309c1221fdb8e73594ecf1b4 Mon Sep 17 00:00:00 2001 From: Shivnandan Kumar Date: Tue, 27 Feb 2024 14:43:51 +0530 Subject: cpufreq: Limit resolving a frequency to policy min/max Resolving a frequency to an efficient one should not transgress policy->max (which can be set for thermal reasons) and policy->min. Currently, there is a possibility that scaling_cur_freq can exceed scaling_max_freq when scaling_max_freq is an inefficient frequency. Add a check to ensure that resolving a frequency will respect policy->min/max. Cc: All applicable Fixes: 1f39fa0dccff ("cpufreq: Introducing CPUFREQ_RELATION_E") Signed-off-by: Shivnandan Kumar [ rjw: Whitespace adjustment, changelog edits ] Signed-off-by: Rafael J.
Wysocki --- include/linux/cpufreq.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 85908b3a2f24..692ea6e55129 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1020,6 +1020,18 @@ static inline int cpufreq_table_find_index_c(struct cpufreq_policy *policy, efficiencies); } +static inline bool cpufreq_is_in_limits(struct cpufreq_policy *policy, int idx) +{ + unsigned int freq; + + if (idx < 0) + return false; + + freq = policy->freq_table[idx].frequency; + + return freq == clamp_val(freq, policy->min, policy->max); +} + static inline int cpufreq_frequency_table_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) @@ -1053,7 +1065,8 @@ retry: return 0; } - if (idx < 0 && efficiencies) { + /* Limit frequency index to honor policy->min/max */ + if (!cpufreq_is_in_limits(policy, idx) && efficiencies) { efficiencies = false; goto retry; } -- cgit v1.2.3 From ad86f7e959dc1814c3b2bcbeb08c3c02214110a7 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 22 Feb 2024 14:56:59 +0100 Subject: firmware: arm_scmi: Populate perf commands rate_limit Arm SCMI spec. v3.2, s4.5.3.4 PERFORMANCE_DOMAIN_ATTRIBUTES defines a per-domain rate_limit for performance requests: """ Rate Limit in microseconds, indicating the minimum time required between successive requests. A value of 0 indicates that this field is not supported by the platform. This field does not apply to FastChannels. """" The field is first defined in SCMI v1.0. Add support to fetch this value and advertise it through a rate_limit_get() callback. Signed-off-by: Pierre Gondois Reviewed-by: Cristian Marussi Acked-by: Sudeep Holla Signed-off-by: Viresh Kumar --- include/linux/scmi_protocol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index f2f05fb42d28..acd956ffcb84 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -128,6 +128,8 @@ struct scmi_perf_domain_info { * @level_set: sets the performance level of a domain * @level_get: gets the performance level of a domain * @transition_latency_get: gets the DVFS transition latency for a given device + * @rate_limit_get: gets the minimum time (us) required between successive + * requests * @device_opps_add: adds all the OPPs for a given device * @freq_set: sets the frequency for a given device using sustained frequency * to sustained performance level mapping @@ -154,6 +156,8 @@ struct scmi_perf_proto_ops { u32 *level, bool poll); int (*transition_latency_get)(const struct scmi_protocol_handle *ph, u32 domain); + int (*rate_limit_get)(const struct scmi_protocol_handle *ph, + u32 domain, u32 *rate_limit); int (*device_opps_add)(const struct scmi_protocol_handle *ph, struct device *dev, u32 domain); int (*freq_set)(const struct scmi_protocol_handle *ph, u32 domain, -- cgit v1.2.3 From 2441caa84aac8abf1be9e20db3e6bb921e74c8a2 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 22 Feb 2024 14:57:00 +0100 Subject: firmware: arm_scmi: Populate fast channel rate_limit Arm SCMI spec. v3.2, s4.5.3.12 PERFORMANCE_DESCRIBE_FASTCHANNEL defines a per-domain rate_limit for performance requests: """ Rate Limit in microseconds, indicating the minimum time required between successive requests. A value of 0 indicates that this field is not applicable or supported on the platform. """" The field is first defined in SCMI v2.0. 
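[ Editor's illustration, not from the patches: a hypothetical consumer showing how the advertised rate limit might be honored between successive requests. A real driver would track the time of its previous request rather than sleeping unconditionally; names are assumptions and error handling is trimmed. ]

	static void foo_scmi_set_level(const struct scmi_perf_proto_ops *perf_ops,
				       const struct scmi_protocol_handle *ph,
				       u32 domain, u32 level)
	{
		u32 rate_limit_us = 0;

		/* A value of 0 means the platform reports no rate limit. */
		if (!perf_ops->rate_limit_get(ph, domain, &rate_limit_us) &&
		    rate_limit_us)
			usleep_range(rate_limit_us, rate_limit_us + 100);

		perf_ops->level_set(ph, domain, level, false);
	}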
Add support to fetch this value and advertise it through a fast_switch_rate_limit() callback. Signed-off-by: Pierre Gondois Reviewed-by: Cristian Marussi Acked-by: Sudeep Holla Signed-off-by: Viresh Kumar --- include/linux/scmi_protocol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index acd956ffcb84..fafedb3b6604 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -139,6 +139,8 @@ struct scmi_perf_domain_info { * at a given frequency * @fast_switch_possible: indicates if fast DVFS switching is possible or not * for a given device + * @fast_switch_rate_limit: gets the minimum time (us) required between + * successive fast_switching requests * @power_scale_mw_get: indicates if the power values provided are in milliWatts * or in some other (abstract) scale */ @@ -168,6 +170,8 @@ struct scmi_perf_proto_ops { unsigned long *rate, unsigned long *power); bool (*fast_switch_possible)(const struct scmi_protocol_handle *ph, u32 domain); + int (*fast_switch_rate_limit)(const struct scmi_protocol_handle *ph, + u32 domain, u32 *rate_limit); enum scmi_power_scale (*power_scale_get)(const struct scmi_protocol_handle *ph); }; -- cgit v1.2.3 From abb3f9717a67a2666b2bc2f19543a657e3d4ad63 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 27 Feb 2024 23:04:32 +0530 Subject: OPP: Extend dev_pm_opp_data with turbo support Let's extend the dev_pm_opp_data with a turbo variable, to allow users to specify if it's a boost frequency for a dynamically added OPP. Signed-off-by: Sibi Sankar Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 76dcb7f37bcd..fa9b63c6bf0b 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -87,12 +87,14 @@ struct dev_pm_opp_config { /** * struct dev_pm_opp_data - The data to use to initialize an OPP. + * @turbo: Flag to indicate whether the OPP is to be marked turbo or not. * @level: The performance level for the OPP. Set level to OPP_LEVEL_UNSET if * level field isn't used. * @freq: The clock rate in Hz for the OPP. * @u_volt: The voltage in uV for the OPP. */ struct dev_pm_opp_data { + bool turbo; unsigned int level; unsigned long freq; unsigned long u_volt; -- cgit v1.2.3 From 838a4772bfc390a14b31c25dc4c9eb66de5f5b1a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 18 Jan 2024 16:19:13 +0530 Subject: cpufreq: Move dev_pm_opp_{init|free}_cpufreq_table() to pm_opp.h Move the declaration of functions defined in the OPP core to pm_opp.h. These were added to cpufreq.h as it was the only user of the APIs, but that was a mistake perhaps. Fix it. 
Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 20 -------------------- include/linux/pm_opp.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index afda5f24d3dd..8ff3e79727d8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -694,26 +694,6 @@ struct cpufreq_frequency_table { * order */ }; -#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) -int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -#else -static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table - **table) -{ - return -EINVAL; -} - -static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table - **table) -{ -} -#endif - /* * cpufreq_for_each_entry - iterate over a cpufreq_frequency_table * @pos: the cpufreq_frequency_table * to use as a loop cursor. diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index fa9b63c6bf0b..065a47382302 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -16,6 +16,7 @@ #include struct clk; +struct cpufreq_frequency_table; struct regulator; struct dev_pm_opp; struct device; @@ -446,6 +447,21 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev) #endif /* CONFIG_PM_OPP */ +#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) +int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table); +void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table); +#else +static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table) +{ + return -EINVAL; +} + +static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table) +{ +} +#endif + + #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) int dev_pm_opp_of_add_table(struct device *dev); int dev_pm_opp_of_add_table_indexed(struct device *dev, int index); -- cgit v1.2.3
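[ Editor's illustration to close: a hedged usage sketch of the relocated helpers. The driver is hypothetical; only the declarations above are assumed, plus get_cpu_device() from linux/cpu.h. ]

	/* Build the policy's frequency table from the CPU device's OPPs. */
	static int foo_cpufreq_init(struct cpufreq_policy *policy)
	{
		struct device *cpu_dev = get_cpu_device(policy->cpu);
		struct cpufreq_frequency_table *table;
		int ret;

		if (!cpu_dev)
			return -ENODEV;

		ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &table);
		if (ret)
			return ret;

		policy->freq_table = table;
		return 0;
	}

	/* Release the table again when the policy goes away. */
	static int foo_cpufreq_exit(struct cpufreq_policy *policy)
	{
		dev_pm_opp_free_cpufreq_table(get_cpu_device(policy->cpu),
					      &policy->freq_table);
		return 0;
	}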