From de1df26b7cef702a32ae876ed45c1112f523df48 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 5 Feb 2016 02:37:42 +0100
Subject: cpufreq: Clean up default and fallback governor setup

The preprocessor magic used for setting the default cpufreq governor
(and for using the performance governor as a fallback one for that
matter) is really nasty, so replace it with __weak functions and
overrides.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Saravana Kannan <skannan@codeaurora.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/cpufreq.h | 25 ++-----------------------
 1 file changed, 2 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 88a4215125bc..d0bf555b6bbf 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -464,29 +464,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
 int cpufreq_register_governor(struct cpufreq_governor *governor);
 void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 
-/* CPUFREQ DEFAULT GOVERNOR */
-/*
- * Performance governor is fallback governor if any other gov failed to auto
- * load due latency restrictions
- */
-#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
-extern struct cpufreq_governor cpufreq_gov_performance;
-#endif
-#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
-#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_performance)
-#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE)
-extern struct cpufreq_governor cpufreq_gov_powersave;
-#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_powersave)
-#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE)
-extern struct cpufreq_governor cpufreq_gov_userspace;
-#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_userspace)
-#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND)
-extern struct cpufreq_governor cpufreq_gov_ondemand;
-#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_ondemand)
-#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
-extern struct cpufreq_governor cpufreq_gov_conservative;
-#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
-#endif
+struct cpufreq_governor *cpufreq_default_governor(void);
+struct cpufreq_governor *cpufreq_fallback_governor(void);
 
 /*********************************************************************
  *                     FREQUENCY TABLE HELPERS                       *
-- 
cgit v1.2.3


From 9f8ea969d5cfdd4353d2adb004e8e2286b984369 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 9 Feb 2016 10:30:33 +0530
Subject: PM / OPP: get/put regulators from OPP core

This allows the OPP core to request/free the regulator resource,
attached to a device OPP. The regulator device is fetched using the name
provided by the driver, while calling: dev_pm_opp_set_regulator().

This will work for both OPP-v1 and v2 bindings.

This is a preliminary step for moving the OPP switching logic into the
OPP core.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_opp.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 95403d2ccaf5..c70a18ac9c8a 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -60,6 +60,8 @@ int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
 void dev_pm_opp_put_supported_hw(struct device *dev);
 int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct device *dev);
+int dev_pm_opp_set_regulator(struct device *dev, const char *name);
+void dev_pm_opp_put_regulator(struct device *dev);
 #else
 static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 {
@@ -151,6 +153,13 @@ static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 
 static inline void dev_pm_opp_put_prop_name(struct device *dev) {}
 
+static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+{
+	return -EINVAL;
+}
+
+static inline void dev_pm_opp_put_regulator(struct device *dev) {}
+
 #endif		/* CONFIG_PM_OPP */
 
 #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
-- 
cgit v1.2.3


From 655c9df961751ce21466f6e97e8033932c27a675 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 9 Feb 2016 10:30:35 +0530
Subject: PM / OPP: Introduce dev_pm_opp_get_max_volt_latency()

In few use cases (like: cpufreq), it is desired to get the maximum
voltage latency for changing OPPs. Add support for that.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_opp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index c70a18ac9c8a..5daa43058ac1 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -34,6 +34,7 @@ bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp);
 
 int dev_pm_opp_get_opp_count(struct device *dev);
 unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
+unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev);
 struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev);
 
 struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
@@ -88,6 +89,11 @@ static inline unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
 	return 0;
 }
 
+static inline unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
+{
+	return 0;
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
 {
 	return NULL;
-- 
cgit v1.2.3


From 2174344765f472895c076d703c9cdc58215e1393 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 9 Feb 2016 10:30:36 +0530
Subject: PM / OPP: Introduce dev_pm_opp_get_max_transition_latency()

In few use cases (like: cpufreq), it is desired to get the maximum
latency for changing OPPs. Add support for that.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_opp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 5daa43058ac1..59da3d9e11ea 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -35,6 +35,7 @@ bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp);
 int dev_pm_opp_get_opp_count(struct device *dev);
 unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
 unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev);
+unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev);
 struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev);
 
 struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
@@ -94,6 +95,11 @@ static inline unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
 	return 0;
 }
 
+static inline unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
+{
+	return 0;
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
 {
 	return NULL;
-- 
cgit v1.2.3


From 6a0712f6f199e737aa5913d28ec4bd3a25de9660 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 9 Feb 2016 10:30:39 +0530
Subject: PM / OPP: Add dev_pm_opp_set_rate()

This adds a routine, dev_pm_opp_set_rate(), responsible for configuring
power-supply and clock source for an OPP.

The OPP is found by matching against the target_freq passed to the
routine. This shall replace similar code present in most of the OPP
users and help simplify them a lot.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_opp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 59da3d9e11ea..cccaf4a29e9f 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -64,6 +64,7 @@ int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct device *dev);
 int dev_pm_opp_set_regulator(struct device *dev, const char *name);
 void dev_pm_opp_put_regulator(struct device *dev);
+int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
 #else
 static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 {
@@ -172,6 +173,11 @@ static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name)
 
 static inline void dev_pm_opp_put_regulator(struct device *dev) {}
 
+static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
+{
+	return -EINVAL;
+}
+
 #endif		/* CONFIG_PM_OPP */
 
 #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
-- 
cgit v1.2.3


From fc5cbf0c94b6f7fd62fdddff892207290510945d Mon Sep 17 00:00:00 2001
From: Axel Haslam <ahaslam+renesas@baylibre.com>
Date: Mon, 15 Feb 2016 11:10:51 +0100
Subject: PM / Domains: Support for multiple states

Some hardware (eg. OMAP), has the ability to enter different low power
modes for a given power domain. This allows for more fine grained control
over the power state of the platform. As a typical example, some registers
of the hardware may be implemented with retention flip-flops and be able
to retain their state at lower voltages allowing for faster on/off
latencies and an increased window of opportunity to enter an intermediate
low power state other than "off"

When trying to set a power domain to off, the genpd governor will choose
the deepest state that will respect the qos constraints of all the devices
and sub-domains on the power domain. The state chosen by the governor is
saved in the "state_idx" field of the generic_pm_domain structure and
shall be used by the power_off and power_on callbacks to perform the
necessary actions to set the power domain into (and out of) the state
indicated by state_idx.

States must be declared in ascending order from shallowest to deepest,
deepest meaning the state which takes longer to enter and exit.

For platforms that don't declare any states, a single a single "off"
state is used. Once all platforms are converted to use the state array,
the legacy on/off latencies will be removed.

[ Lina: Modified genpd state initialization and remove use of
        save_state_latency_ns in genpd timing data ]
Suggested-by: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Signed-off-by: Axel Haslam <ahaslam+renesas@baylibre.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_domain.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index db21d3995f7e..1726c4ac6529 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -19,6 +19,8 @@
 /* Defines used for the flags field in the struct generic_pm_domain */
 #define GENPD_FLAG_PM_CLK	(1U << 0) /* PM domain uses PM clk */
 
+#define GENPD_MAX_NUM_STATES	8 /* Number of possible low power states */
+
 enum gpd_status {
 	GPD_STATE_ACTIVE = 0,	/* PM domain is active */
 	GPD_STATE_POWER_OFF,	/* PM domain is off */
@@ -37,6 +39,11 @@ struct gpd_dev_ops {
 	bool (*active_wakeup)(struct device *dev);
 };
 
+struct genpd_power_state {
+	s64 power_off_latency_ns;
+	s64 power_on_latency_ns;
+};
+
 struct generic_pm_domain {
 	struct dev_pm_domain domain;	/* PM domain operations */
 	struct list_head gpd_list_node;	/* Node in the global PM domains list */
@@ -66,6 +73,10 @@ struct generic_pm_domain {
 	void (*detach_dev)(struct generic_pm_domain *domain,
 			   struct device *dev);
 	unsigned int flags;		/* Bit field of configs for genpd */
+	struct genpd_power_state states[GENPD_MAX_NUM_STATES];
+	unsigned int state_count; /* number of states */
+	unsigned int state_idx; /* state that genpd will go to when off */
+
 };
 
 static inline struct generic_pm_domain *pd_to_genpd(struct dev_pm_domain *pd)
-- 
cgit v1.2.3


From 90e63452ac3a42c9ff10b12880429e8592cf39ec Mon Sep 17 00:00:00 2001
From: Axel Haslam <ahaslam+renesas@baylibre.com>
Date: Mon, 15 Feb 2016 11:10:53 +0100
Subject: PM / Domains: remove old power on/off latencies

Now that all known users have been converted to use state latencies,
we can remove the latency field in the generic_pm_domain structure.

Signed-off-by: Axel Haslam <ahaslam+renesas@baylibre.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_domain.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 1726c4ac6529..49cd8890b873 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -61,9 +61,7 @@ struct generic_pm_domain {
 	unsigned int prepared_count;	/* Suspend counter of prepared devices */
 	bool suspend_power_off;	/* Power status before system suspend */
 	int (*power_off)(struct generic_pm_domain *domain);
-	s64 power_off_latency_ns;
 	int (*power_on)(struct generic_pm_domain *domain);
-	s64 power_on_latency_ns;
 	struct gpd_dev_ops dev_ops;
 	s64 max_off_time_ns;	/* Maximum allowed "suspended" time. */
 	bool max_off_time_changed;
-- 
cgit v1.2.3


From 34b0870515aaac6b7ea1ffdc370516b0a8024c82 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 26 Feb 2016 00:22:57 +0100
Subject: cpufreq: Simplify the cpufreq_for_each_valid_entry()

That macro uses an internal static inline function that is first
totally unnecessary and second hard to read, so simplify it and
get rid of that monster.

No functional changes.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/cpufreq.h | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d0bf555b6bbf..4064cfcfbffd 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -504,16 +504,6 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
 }
 #endif
 
-static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos)
-{
-	while ((*pos)->frequency != CPUFREQ_TABLE_END)
-		if ((*pos)->frequency != CPUFREQ_ENTRY_INVALID)
-			return true;
-		else
-			(*pos)++;
-	return false;
-}
-
 /*
  * cpufreq_for_each_entry -	iterate over a cpufreq_frequency_table
  * @pos:	the cpufreq_frequency_table * to use as a loop cursor.
@@ -530,8 +520,11 @@ static inline bool cpufreq_next_valid(struct cpufreq_frequency_table **pos)
  * @table:      the cpufreq_frequency_table * to iterate over.
  */
 
-#define cpufreq_for_each_valid_entry(pos, table)	\
-	for (pos = table; cpufreq_next_valid(&pos); pos++)
+#define cpufreq_for_each_valid_entry(pos, table)			\
+	for (pos = table; pos->frequency != CPUFREQ_TABLE_END; pos++)	\
+		if (pos->frequency == CPUFREQ_ENTRY_INVALID)		\
+			continue;					\
+		else
 
 int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 				    struct cpufreq_frequency_table *table);
-- 
cgit v1.2.3


From e237a5518425155faa508a087f28269f58074b92 Mon Sep 17 00:00:00 2001
From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Date: Mon, 15 Feb 2016 12:52:01 +0800
Subject: x86/ACPI/PCI: Recognize that Interrupt Line 255 means "not connected"

Per the x86-specific footnote to PCI spec r3.0, sec 6.2.4, the value 255 in
the Interrupt Line register means "unknown" or "no connection."
Previously, when we couldn't derive an IRQ from the _PRT, we fell back to
using the value from Interrupt Line as an IRQ.  It's questionable whether
we should do that at all, but the spec clearly suggests we shouldn't do it
for the value 255 on x86.

Calling request_irq() with IRQ 255 may succeed, but the driver won't
receive any interrupts.  Or, if IRQ 255 is shared with another device, it
may succeed, and the driver's ISR will be called at random times when the
*other* device interrupts.  Or it may fail if another device is using IRQ
255 with incompatible flags.  What we *want* is for request_irq() to fail
predictably so the driver can fall back to polling.

On x86, assume 255 in the Interrupt Line means the INTx line is not
connected.  In that case, set dev->irq to IRQ_NOTCONNECTED so request_irq()
will fail gracefully with -ENOTCONN.

We found this problem on a system where Secure Boot firmware assigned
Interrupt Line 255 to an i801_smbus device and another device was already
using MSI-X IRQ 255.  This was in v3.10, where i801_probe() fails if
request_irq() fails:

  i801_smbus 0000:00:1f.3: enabling device (0140 -> 0143)
  i801_smbus 0000:00:1f.3: can't derive routing for PCI INT C
  i801_smbus 0000:00:1f.3: PCI INT C: no GSI
  genirq: Flags mismatch irq 255. 00000080 (i801_smbus) vs. 00000000 (megasa)
  CPU: 0 PID: 2487 Comm: kworker/0:1 Not tainted 3.10.0-229.el7.x86_64 #1
  Hardware name: FUJITSU PRIMEQUEST 2800E2/D3736, BIOS PRIMEQUEST 2000 Serie5
  Call Trace:
    dump_stack+0x19/0x1b
    __setup_irq+0x54a/0x570
    request_threaded_irq+0xcc/0x170
    i801_probe+0x32f/0x508 [i2c_i801]
    local_pci_probe+0x45/0xa0
  i801_smbus 0000:00:1f.3: Failed to allocate irq 255: -16
  i801_smbus: probe of 0000:00:1f.3 failed with error -16

After aeb8a3d16ae0 ("i2c: i801: Check if interrupts are disabled"),
i801_probe() will fall back to polling if request_irq() fails.  But we
still need this patch because request_irq() may succeed or fail depending
on other devices in the system.  If request_irq() fails, i801_smbus will
work by falling back to polling, but if it succeeds, i801_smbus won't work
because it expects interrupts that it may not receive.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/interrupt.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 0e95fcc75b2a..358076eda364 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -125,6 +125,16 @@ struct irqaction {
 
 extern irqreturn_t no_action(int cpl, void *dev_id);
 
+/*
+ * If a (PCI) device interrupt is not connected we set dev->irq to
+ * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we
+ * can distingiush that case from other error returns.
+ *
+ * 0x80000000 is guaranteed to be outside the available range of interrupts
+ * and easy to distinguish from other possible incorrect values.
+ */
+#define IRQ_NOTCONNECTED	(1U << 31)
+
 extern int __must_check
 request_threaded_irq(unsigned int irq, irq_handler_t handler,
 		     irq_handler_t thread_fn,
-- 
cgit v1.2.3


From 34e2c555f3e13c90e9284e23d00f03be8a6e06c5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 15 Feb 2016 20:20:42 +0100
Subject: cpufreq: Add mechanism for registering utilization update callbacks

Introduce a mechanism by which parts of the cpufreq subsystem
("setpolicy" drivers or the core) can register callbacks to be
executed from cpufreq_update_util() which is invoked by the
scheduler's update_load_avg() on CPU utilization changes.

This allows the "setpolicy" drivers to dispense with their timers
and do all of the computations they need and frequency/voltage
adjustments in the update_load_avg() code path, among other things.

The update_load_avg() changes were suggested by Peter Zijlstra.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpufreq.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d0bf555b6bbf..704d85bf7242 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -151,6 +151,36 @@ static inline bool policy_is_shared(struct cpufreq_policy *policy)
 extern struct kobject *cpufreq_global_kobject;
 
 #ifdef CONFIG_CPU_FREQ
+void cpufreq_update_util(u64 time, unsigned long util, unsigned long max);
+
+/**
+ * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
+ * @time: Current time.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS,
+ * though, because they may not be coming in if RT or deadline tasks are active
+ * all the time (or there are RT and DL tasks only).
+ *
+ * As a workaround for that issue, this function is called by the RT and DL
+ * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid.  Going forward it should be replaced with
+ * solutions targeted more specifically at RT and DL tasks.
+ */
+static inline void cpufreq_trigger_update(u64 time)
+{
+	cpufreq_update_util(time, ULONG_MAX, 0);
+}
+
+struct update_util_data {
+	void (*func)(struct update_util_data *data,
+		     u64 time, unsigned long util, unsigned long max);
+};
+
+void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
+
 unsigned int cpufreq_get(unsigned int cpu);
 unsigned int cpufreq_quick_get(unsigned int cpu);
 unsigned int cpufreq_quick_get_max(unsigned int cpu);
@@ -162,6 +192,10 @@ int cpufreq_update_policy(unsigned int cpu);
 bool have_governor_per_policy(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 #else
+static inline void cpufreq_update_util(u64 time, unsigned long util,
+				       unsigned long max) {}
+static inline void cpufreq_trigger_update(u64 time) {}
+
 static inline unsigned int cpufreq_get(unsigned int cpu)
 {
 	return 0;
-- 
cgit v1.2.3


From 68e80dae09033d778b98dc88e5bfe8fdade188e5 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 9 Feb 2016 09:01:35 +0530
Subject: Revert "cpufreq: Drop rwsem lock around CPUFREQ_GOV_POLICY_EXIT"

Earlier, when the struct freq-attr was used to represent governor
attributes, the standard cpufreq show/store sysfs attribute callbacks
were applied to the governor tunable attributes and they always acquire
the policy->rwsem lock before carrying out the operation.  That could
have resulted in an ABBA deadlock if governor tunable attributes are
removed under policy->rwsem while one of them is being accessed
concurrently (if sysfs attributes removal wins the race, it will wait
for the access to complete with policy->rwsem held while the attribute
callback will block on policy->rwsem indefinitely).

We attempted to address this issue by dropping policy->rwsem around
governor tunable attributes removal (that is, around invocations of the
->governor callback with the event arg equal to CPUFREQ_GOV_POLICY_EXIT)
in cpufreq_set_policy(), but that opened up race conditions that had not
been possible with policy->rwsem held all the time.

The previous commit, "cpufreq: governor: New sysfs show/store callbacks
for governor tunables", fixed the original ABBA deadlock by adding new
governor specific show/store callbacks.

We don't have to drop rwsem around invocations of governor event
CPUFREQ_GOV_POLICY_EXIT anymore, and original fix can be reverted now.

Fixes: 955ef4833574 (cpufreq: Drop rwsem lock around CPUFREQ_GOV_POLICY_EXIT)
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reported-by: Juri Lelli <juri.lelli@arm.com>
Tested-by: Juri Lelli <juri.lelli@arm.com>
Tested-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 704d85bf7242..cac3d1ba8200 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -100,10 +100,6 @@ struct cpufreq_policy {
 	 * - Any routine that will write to the policy structure and/or may take away
 	 *   the policy altogether (eg. CPU hotplug), will hold this lock in write
 	 *   mode before doing so.
-	 *
-	 * Additional rules:
-	 * - Lock should not be held across
-	 *     __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);
 	 */
 	struct rw_semaphore	rwsem;
 
-- 
cgit v1.2.3


From 242aa883a64d8c54cfeee47f3603b21bc705e081 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 22 Feb 2016 16:36:44 +0530
Subject: cpufreq: Remove 'policy->governor_enabled'

The entire sequence of events (like INIT/START or STOP/EXIT) for which
cpufreq_governor() is called, is guaranteed to be protected by
policy->rwsem now.

The additional checks that were added earlier (as we were forced to drop
policy->rwsem before calling cpufreq_governor() for EXIT event), aren't
required anymore.

Over that, they weren't sufficient really. They just take care of
START/STOP events, but not INIT/EXIT and the state machine was never
maintained properly by them.

Kill the unnecessary checks and policy->governor_enabled field.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index cac3d1ba8200..a50c5b2e3bf2 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -80,7 +80,6 @@ struct cpufreq_policy {
 	unsigned int		last_policy; /* policy before unplug */
 	struct cpufreq_governor	*governor; /* see below */
 	void			*governor_data;
-	bool			governor_enabled; /* governor start/stop flag */
 	char			last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
 
 	struct work_struct	update; /* if update_policy() needs to be
-- 
cgit v1.2.3


From adaf9fcd136970e480d7ca834c0cf25ce922ea74 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 10 Mar 2016 20:44:47 +0100
Subject: cpufreq: Move scheduler-related code to the sched directory

Create cpufreq.c under kernel/sched/ and move the cpufreq code
related to the scheduler to that file and to sched.h.

Redefine cpufreq_update_util() as a static inline function to avoid
function calls at its call sites in the scheduler code (as suggested
by Peter Zijlstra).

Also move the definition of struct update_util_data and declaration
of cpufreq_set_update_util_data() from include/linux/cpufreq.h to
include/linux/sched.h.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpufreq.h | 34 ----------------------------------
 include/linux/sched.h   |  9 +++++++++
 2 files changed, 9 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index a50c5b2e3bf2..a5ea52f793f3 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -146,36 +146,6 @@ static inline bool policy_is_shared(struct cpufreq_policy *policy)
 extern struct kobject *cpufreq_global_kobject;
 
 #ifdef CONFIG_CPU_FREQ
-void cpufreq_update_util(u64 time, unsigned long util, unsigned long max);
-
-/**
- * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
- * @time: Current time.
- *
- * The way cpufreq is currently arranged requires it to evaluate the CPU
- * performance state (frequency/voltage) on a regular basis to prevent it from
- * being stuck in a completely inadequate performance level for too long.
- * That is not guaranteed to happen if the updates are only triggered from CFS,
- * though, because they may not be coming in if RT or deadline tasks are active
- * all the time (or there are RT and DL tasks only).
- *
- * As a workaround for that issue, this function is called by the RT and DL
- * sched classes to trigger extra cpufreq updates to prevent it from stalling,
- * but that really is a band-aid.  Going forward it should be replaced with
- * solutions targeted more specifically at RT and DL tasks.
- */
-static inline void cpufreq_trigger_update(u64 time)
-{
-	cpufreq_update_util(time, ULONG_MAX, 0);
-}
-
-struct update_util_data {
-	void (*func)(struct update_util_data *data,
-		     u64 time, unsigned long util, unsigned long max);
-};
-
-void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
-
 unsigned int cpufreq_get(unsigned int cpu);
 unsigned int cpufreq_quick_get(unsigned int cpu);
 unsigned int cpufreq_quick_get_max(unsigned int cpu);
@@ -187,10 +157,6 @@ int cpufreq_update_policy(unsigned int cpu);
 bool have_governor_per_policy(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 #else
-static inline void cpufreq_update_util(u64 time, unsigned long util,
-				       unsigned long max) {}
-static inline void cpufreq_trigger_update(u64 time) {}
-
 static inline unsigned int cpufreq_get(unsigned int cpu)
 {
 	return 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a94cc3..913e755ef7b8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3207,4 +3207,13 @@ static inline unsigned long rlimit_max(unsigned int limit)
 	return task_rlimit_max(current, limit);
 }
 
+#ifdef CONFIG_CPU_FREQ
+struct update_util_data {
+	void (*func)(struct update_util_data *data,
+		     u64 time, unsigned long util, unsigned long max);
+};
+
+void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
+#endif /* CONFIG_CPU_FREQ */
+
 #endif
-- 
cgit v1.2.3