From 818489e511a3a38d4c9c120675e0700608925800 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <Sudeep.Holla@arm.com>
Date: Mon, 18 Jun 2018 15:34:22 +0100
Subject: [PATCH 01/35] ARM: cpuidle: silence error on driver registration
 failure

It's perfectly fine to have multiple cpuidle driver compiled in the
build configuration. However, it's not good to throw error on driver
registration failure if some other driver is already initialised and
assigned. In such cases, __cpuidle_register_driver returns -EBUSY and
we can check for such error before throwing the error.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle-arm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c
index e07bc7ace774..073557f433eb 100644
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -105,7 +105,8 @@ static int __init arm_idle_init_cpu(int cpu)
 
 	ret = cpuidle_register_driver(drv);
 	if (ret) {
-		pr_err("Failed to register cpuidle driver\n");
+		if (ret != -EBUSY)
+			pr_err("Failed to register cpuidle driver\n");
 		goto out_kfree_drv;
 	}
 

From 1111b7836c80ec4094318e1dfb3a5abe6df95afb Mon Sep 17 00:00:00 2001
From: Xie Yisheng <xieyisheng1@huawei.com>
Date: Thu, 31 May 2018 19:11:15 +0800
Subject: [PATCH 02/35] cpufreq: intel_pstate: use match_string() helper

match_string() returns the index of an array for a matching string,
which can be used instead of open coded variant.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index ece120da3353..a5c368425e36 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -657,21 +657,18 @@ static ssize_t store_energy_performance_preference(
 {
 	struct cpudata *cpu_data = all_cpu_data[policy->cpu];
 	char str_preference[21];
-	int ret, i = 0;
+	int ret;
 
 	ret = sscanf(buf, "%20s", str_preference);
 	if (ret != 1)
 		return -EINVAL;
 
-	while (energy_perf_strings[i] != NULL) {
-		if (!strcmp(str_preference, energy_perf_strings[i])) {
-			intel_pstate_set_energy_pref_index(cpu_data, i);
-			return count;
-		}
-		++i;
-	}
+	ret = match_string(energy_perf_strings, -1, str_preference);
+	if (ret < 0)
+		return ret;
 
-	return -EINVAL;
+	intel_pstate_set_energy_pref_index(cpu_data, ret);
+	return count;
 }
 
 static ssize_t show_energy_performance_preference(

From a1d0015423920672c3f32e69bd9feef2ec629b57 Mon Sep 17 00:00:00 2001
From: Bastian Stender <bst@pengutronix.de>
Date: Fri, 8 Jun 2018 11:06:39 +0200
Subject: [PATCH 03/35] cpufreq: imx6q/thermal: imx: register cooling device
 depending on OF

The cooling device should be part of the i.MX cpufreq driver, but it
cannot be removed for the sake of DT stability. So turn the cooling
device registration into a separate function and perform the
registration only if the CPU OF node does not have the #cooling-cells
property.

Use of_cpufreq_power_cooling_register in imx_thermal code to link the
cooling device to the device tree node provided.

This makes it possible to bind the cpufreq cooling device to a custom
thermal zone via a cooling-maps entry like:

	cooling-maps {
		map0 {
			trip = <&board_alert>;
			cooling-device = <&cpu0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
		};
	};

Assuming a cpu node exists with label "cpu0" and #cooling-cells
property.

Signed-off-by: Bastian Stender <bst@pengutronix.de>
Reviewed-by: Lucas Stach <l.stach@pengutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/imx6q-cpufreq.c | 21 +++++++++++++++++++++
 drivers/thermal/imx_thermal.c   | 28 ++++++++++++++++++++++++----
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 8b3c2a79ad6c..b2ff423ad7f8 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -9,6 +9,7 @@
 #include <linux/clk.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
+#include <linux/cpu_cooling.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/of.h>
@@ -50,6 +51,7 @@ static struct clk_bulk_data clks[] = {
 };
 
 static struct device *cpu_dev;
+static struct thermal_cooling_device *cdev;
 static bool free_opp;
 static struct cpufreq_frequency_table *freq_table;
 static unsigned int max_freq;
@@ -191,6 +193,16 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 	return 0;
 }
 
+static void imx6q_cpufreq_ready(struct cpufreq_policy *policy)
+{
+	cdev = of_cpufreq_cooling_register(policy);
+
+	if (!cdev)
+		dev_err(cpu_dev,
+			"running cpufreq without cooling device: %ld\n",
+			PTR_ERR(cdev));
+}
+
 static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
 {
 	int ret;
@@ -202,13 +214,22 @@ static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
 	return ret;
 }
 
+static int imx6q_cpufreq_exit(struct cpufreq_policy *policy)
+{
+	cpufreq_cooling_unregister(cdev);
+
+	return 0;
+}
+
 static struct cpufreq_driver imx6q_cpufreq_driver = {
 	.flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.verify = cpufreq_generic_frequency_table_verify,
 	.target_index = imx6q_set_target,
 	.get = cpufreq_generic_get,
 	.init = imx6q_cpufreq_init,
+	.exit = imx6q_cpufreq_exit,
 	.name = "imx6q-cpufreq",
+	.ready = imx6q_cpufreq_ready,
 	.attr = cpufreq_generic_attr,
 	.suspend = cpufreq_generic_suspend,
 };
diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c
index 334d98be03b9..cbfcca828cd7 100644
--- a/drivers/thermal/imx_thermal.c
+++ b/drivers/thermal/imx_thermal.c
@@ -3,6 +3,7 @@
 // Copyright 2013 Freescale Semiconductor, Inc.
 
 #include <linux/clk.h>
+#include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/cpu_cooling.h>
 #include <linux/delay.h>
@@ -644,6 +645,27 @@ static const struct of_device_id of_imx_thermal_match[] = {
 };
 MODULE_DEVICE_TABLE(of, of_imx_thermal_match);
 
+/*
+ * Create cooling device in case no #cooling-cells property is available in
+ * CPU node
+ */
+static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data)
+{
+	struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL);
+	int ret;
+
+	if (!np || !of_find_property(np, "#cooling-cells", NULL)) {
+		data->cdev = cpufreq_cooling_register(data->policy);
+		if (IS_ERR(data->cdev)) {
+			ret = PTR_ERR(data->cdev);
+			cpufreq_cpu_put(data->policy);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
 static int imx_thermal_probe(struct platform_device *pdev)
 {
 	struct imx_thermal_data *data;
@@ -724,12 +746,10 @@ static int imx_thermal_probe(struct platform_device *pdev)
 		return -EPROBE_DEFER;
 	}
 
-	data->cdev = cpufreq_cooling_register(data->policy);
-	if (IS_ERR(data->cdev)) {
-		ret = PTR_ERR(data->cdev);
+	ret = imx_thermal_register_legacy_cooling(data);
+	if (ret) {
 		dev_err(&pdev->dev,
 			"failed to register cpufreq cooling device: %d\n", ret);
-		cpufreq_cpu_put(data->policy);
 		return ret;
 	}
 

From d5641c64c48f4408500a348301bff01fbf2c1ec5 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmx.com>
Date: Mon, 25 Jun 2018 13:30:58 +0800
Subject: [PATCH 04/35] PM / hibernate: cast PAGE_SIZE to int when comparing
 with error code

If PAGE_SIZE is unsigned type then negative error code will be
larger than PAGE_SIZE.

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/swap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index c2bcf97d24c8..d7f6c1a288d3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -923,7 +923,7 @@ int swsusp_write(unsigned int flags)
 	}
 	memset(&snapshot, 0, sizeof(struct snapshot_handle));
 	error = snapshot_read_next(&snapshot);
-	if (error < PAGE_SIZE) {
+	if (error < (int)PAGE_SIZE) {
 		if (error >= 0)
 			error = -EFAULT;
 
@@ -1483,7 +1483,7 @@ int swsusp_read(unsigned int *flags_p)
 
 	memset(&snapshot, 0, sizeof(struct snapshot_handle));
 	error = snapshot_write_next(&snapshot);
-	if (error < PAGE_SIZE)
+	if (error < (int)PAGE_SIZE)
 		return error < 0 ? error : -EFAULT;
 	header = (struct swsusp_info *)data_of(snapshot);
 	error = get_swap_reader(&handle, flags_p);

From 88763a5cf80ca59a7c3bea32681ce8f697d9995f Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 26 Jun 2018 12:53:29 +0200
Subject: [PATCH 05/35] powercap / idle_inject: Add an idle injection framework

Initially, the cpu_cooling device for ARM was changed by adding a new
policy inserting idle cycles. The intel_powerclamp driver does a
similar action.

Instead of implementing idle injections privately in the cpu_cooling
device, move the idle injection code in a dedicated framework and give
the opportunity to other frameworks to make use of it.

The framework relies on the smpboot kthreads which handles via its
main loop the common code for hotplugging and [un]parking.

This code was previously tested with the cpu cooling device and went
through several iterations. It results now in split code and API
exported in the header file. It was tested with the cpu cooling device
with success.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
[ rjw: Rewrite of all comments ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/Kconfig       |  10 +
 drivers/powercap/Makefile      |   1 +
 drivers/powercap/idle_inject.c | 356 +++++++++++++++++++++++++++++++++
 include/linux/idle_inject.h    |  29 +++
 4 files changed, 396 insertions(+)
 create mode 100644 drivers/powercap/idle_inject.c
 create mode 100644 include/linux/idle_inject.h

diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig
index 85727ef6ce8e..6ac27e5908f5 100644
--- a/drivers/powercap/Kconfig
+++ b/drivers/powercap/Kconfig
@@ -29,4 +29,14 @@ config INTEL_RAPL
 	  controller, CPU core (Power Plance 0), graphics uncore (Power Plane
 	  1), etc.
 
+config IDLE_INJECT
+	bool "Idle injection framework"
+	depends on CPU_IDLE
+	default n
+	help
+	  This enables support for the idle injection framework. It
+	  provides a way to force idle periods on a set of specified
+	  CPUs for power capping. Idle period can be injected
+	  synchronously on a set of specified CPUs or alternatively
+	  on a per CPU basis.
 endif
diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile
index 0a21ef31372b..1b328854b36e 100644
--- a/drivers/powercap/Makefile
+++ b/drivers/powercap/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_POWERCAP)	+= powercap_sys.o
 obj-$(CONFIG_INTEL_RAPL) += intel_rapl.o
+obj-$(CONFIG_IDLE_INJECT) += idle_inject.o
diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c
new file mode 100644
index 000000000000..24ff2a068978
--- /dev/null
+++ b/drivers/powercap/idle_inject.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * The idle injection framework provides a way to force CPUs to enter idle
+ * states for a specified fraction of time over a specified period.
+ *
+ * It relies on the smpboot kthreads feature providing common code for CPU
+ * hotplug and thread [un]parking.
+ *
+ * All of the kthreads used for idle injection are created at init time.
+ *
+ * Next, the users of the the idle injection framework provide a cpumask via
+ * its register function. The kthreads will be synchronized with respect to
+ * this cpumask.
+ *
+ * The idle + run duration is specified via separate helpers and that allows
+ * idle injection to be started.
+ *
+ * The idle injection kthreads will call play_idle() with the idle duration
+ * specified as per the above.
+ *
+ * After all of them have been woken up, a timer is set to start the next idle
+ * injection cycle.
+ *
+ * The timer interrupt handler will wake up the idle injection kthreads for
+ * all of the CPUs in the cpumask provided by the user.
+ *
+ * Idle injection is stopped synchronously and no leftover idle injection
+ * kthread activity after its completion is guaranteed.
+ *
+ * It is up to the user of this framework to provide a lock for higher-level
+ * synchronization to prevent race conditions like starting idle injection
+ * while unregistering from the framework.
+ */
+#define pr_fmt(fmt) "ii_dev: " fmt
+
+#include <linux/cpu.h>
+#include <linux/hrtimer.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smpboot.h>
+
+#include <uapi/linux/sched/types.h>
+
+/**
+ * struct idle_inject_thread - task on/off switch structure
+ * @tsk: task injecting the idle cycles
+ * @should_run: whether or not to run the task (for the smpboot kthread API)
+ */
+struct idle_inject_thread {
+	struct task_struct *tsk;
+	int should_run;
+};
+
+/**
+ * struct idle_inject_device - idle injection data
+ * @timer: idle injection period timer
+ * @idle_duration_ms: duration of CPU idle time to inject
+ * @run_duration_ms: duration of CPU run time to allow
+ * @cpumask: mask of CPUs affected by idle injection
+ */
+struct idle_inject_device {
+	struct hrtimer timer;
+	unsigned int idle_duration_ms;
+	unsigned int run_duration_ms;
+	unsigned long int cpumask[0];
+};
+
+static DEFINE_PER_CPU(struct idle_inject_thread, idle_inject_thread);
+static DEFINE_PER_CPU(struct idle_inject_device *, idle_inject_device);
+
+/**
+ * idle_inject_wakeup - Wake up idle injection threads
+ * @ii_dev: target idle injection device
+ *
+ * Every idle injection task associated with the given idle injection device
+ * and running on an online CPU will be woken up.
+ */
+static void idle_inject_wakeup(struct idle_inject_device *ii_dev)
+{
+	struct idle_inject_thread *iit;
+	unsigned int cpu;
+
+	for_each_cpu_and(cpu, to_cpumask(ii_dev->cpumask), cpu_online_mask) {
+		iit = per_cpu_ptr(&idle_inject_thread, cpu);
+		iit->should_run = 1;
+		wake_up_process(iit->tsk);
+	}
+}
+
+/**
+ * idle_inject_timer_fn - idle injection timer function
+ * @timer: idle injection hrtimer
+ *
+ * This function is called when the idle injection timer expires.  It wakes up
+ * idle injection tasks associated with the timer and they, in turn, invoke
+ * play_idle() to inject a specified amount of CPU idle time.
+ *
+ * Return: HRTIMER_RESTART.
+ */
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+	unsigned int duration_ms;
+	struct idle_inject_device *ii_dev =
+		container_of(timer, struct idle_inject_device, timer);
+
+	duration_ms = READ_ONCE(ii_dev->run_duration_ms);
+	duration_ms += READ_ONCE(ii_dev->idle_duration_ms);
+
+	idle_inject_wakeup(ii_dev);
+
+	hrtimer_forward_now(timer, ms_to_ktime(duration_ms));
+
+	return HRTIMER_RESTART;
+}
+
+/**
+ * idle_inject_fn - idle injection work function
+ * @cpu: the CPU owning the task
+ *
+ * This function calls play_idle() to inject a specified amount of CPU idle
+ * time.
+ */
+static void idle_inject_fn(unsigned int cpu)
+{
+	struct idle_inject_device *ii_dev;
+	struct idle_inject_thread *iit;
+
+	ii_dev = per_cpu(idle_inject_device, cpu);
+	iit = per_cpu_ptr(&idle_inject_thread, cpu);
+
+	/*
+	 * Let the smpboot main loop know that the task should not run again.
+	 */
+	iit->should_run = 0;
+
+	play_idle(READ_ONCE(ii_dev->idle_duration_ms));
+}
+
+/**
+ * idle_inject_set_duration - idle and run duration update helper
+ * @run_duration_ms: CPU run time to allow in milliseconds
+ * @idle_duration_ms: CPU idle time to inject in milliseconds
+ */
+void idle_inject_set_duration(struct idle_inject_device *ii_dev,
+			      unsigned int run_duration_ms,
+			      unsigned int idle_duration_ms)
+{
+	if (run_duration_ms && idle_duration_ms) {
+		WRITE_ONCE(ii_dev->run_duration_ms, run_duration_ms);
+		WRITE_ONCE(ii_dev->idle_duration_ms, idle_duration_ms);
+	}
+}
+
+/**
+ * idle_inject_get_duration - idle and run duration retrieval helper
+ * @run_duration_ms: memory location to store the current CPU run time
+ * @idle_duration_ms: memory location to store the current CPU idle time
+ */
+void idle_inject_get_duration(struct idle_inject_device *ii_dev,
+			      unsigned int *run_duration_ms,
+			      unsigned int *idle_duration_ms)
+{
+	*run_duration_ms = READ_ONCE(ii_dev->run_duration_ms);
+	*idle_duration_ms = READ_ONCE(ii_dev->idle_duration_ms);
+}
+
+/**
+ * idle_inject_start - start idle injections
+ * @ii_dev: idle injection control device structure
+ *
+ * The function starts idle injection by first waking up all of the idle
+ * injection kthreads associated with @ii_dev to let them inject CPU idle time
+ * sets up a timer to start the next idle injection period.
+ *
+ * Return: -EINVAL if the CPU idle or CPU run time is not set or 0 on success.
+ */
+int idle_inject_start(struct idle_inject_device *ii_dev)
+{
+	unsigned int idle_duration_ms = READ_ONCE(ii_dev->idle_duration_ms);
+	unsigned int run_duration_ms = READ_ONCE(ii_dev->run_duration_ms);
+
+	if (!idle_duration_ms || !run_duration_ms)
+		return -EINVAL;
+
+	pr_debug("Starting injecting idle cycles on CPUs '%*pbl'\n",
+		 cpumask_pr_args(to_cpumask(ii_dev->cpumask)));
+
+	idle_inject_wakeup(ii_dev);
+
+	hrtimer_start(&ii_dev->timer,
+		      ms_to_ktime(idle_duration_ms + run_duration_ms),
+		      HRTIMER_MODE_REL);
+
+	return 0;
+}
+
+/**
+ * idle_inject_stop - stops idle injections
+ * @ii_dev: idle injection control device structure
+ *
+ * The function stops idle injection and waits for the threads to finish work.
+ * If CPU idle time is being injected when this function runs, then it will
+ * wait until the end of the cycle.
+ *
+ * When it returns, there is no more idle injection kthread activity.  The
+ * kthreads are scheduled out and the periodic timer is off.
+ */
+void idle_inject_stop(struct idle_inject_device *ii_dev)
+{
+	struct idle_inject_thread *iit;
+	unsigned int cpu;
+
+	pr_debug("Stopping idle injection on CPUs '%*pbl'\n",
+		 cpumask_pr_args(to_cpumask(ii_dev->cpumask)));
+
+	hrtimer_cancel(&ii_dev->timer);
+
+	/*
+	 * Stopping idle injection requires all of the idle injection kthreads
+	 * associated with the given cpumask to be parked and stay that way, so
+	 * prevent CPUs from going online at this point.  Any CPUs going online
+	 * after the loop below will be covered by clearing the should_run flag
+	 * that will cause the smpboot main loop to schedule them out.
+	 */
+	cpu_hotplug_disable();
+
+	/*
+	 * Iterate over all (online + offline) CPUs here in case one of them
+	 * goes offline with the should_run flag set so as to prevent its idle
+	 * injection kthread from running when the CPU goes online again after
+	 * the ii_dev has been freed.
+	 */
+	for_each_cpu(cpu, to_cpumask(ii_dev->cpumask)) {
+		iit = per_cpu_ptr(&idle_inject_thread, cpu);
+		iit->should_run = 0;
+
+		wait_task_inactive(iit->tsk, 0);
+	}
+
+	cpu_hotplug_enable();
+}
+
+/**
+ * idle_inject_setup - prepare the current task for idle injection
+ * @cpu: not used
+ *
+ * Called once, this function is in charge of setting the current task's
+ * scheduler parameters to make it an RT task.
+ */
+static void idle_inject_setup(unsigned int cpu)
+{
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+}
+
+/**
+ * idle_inject_should_run - function helper for the smpboot API
+ * @cpu: CPU the kthread is running on
+ *
+ * Return: whether or not the thread can run.
+ */
+static int idle_inject_should_run(unsigned int cpu)
+{
+	struct idle_inject_thread *iit =
+		per_cpu_ptr(&idle_inject_thread, cpu);
+
+	return iit->should_run;
+}
+
+/**
+ * idle_inject_register - initialize idle injection on a set of CPUs
+ * @cpumask: CPUs to be affected by idle injection
+ *
+ * This function creates an idle injection control device structure for the
+ * given set of CPUs and initializes the timer associated with it.  It does not
+ * start any injection cycles.
+ *
+ * Return: NULL if memory allocation fails, idle injection control device
+ * pointer on success.
+ */
+struct idle_inject_device *idle_inject_register(struct cpumask *cpumask)
+{
+	struct idle_inject_device *ii_dev;
+	int cpu, cpu_rb;
+
+	ii_dev = kzalloc(sizeof(*ii_dev) + cpumask_size(), GFP_KERNEL);
+	if (!ii_dev)
+		return NULL;
+
+	cpumask_copy(to_cpumask(ii_dev->cpumask), cpumask);
+	hrtimer_init(&ii_dev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	ii_dev->timer.function = idle_inject_timer_fn;
+
+	for_each_cpu(cpu, to_cpumask(ii_dev->cpumask)) {
+
+		if (per_cpu(idle_inject_device, cpu)) {
+			pr_err("cpu%d is already registered\n", cpu);
+			goto out_rollback;
+		}
+
+		per_cpu(idle_inject_device, cpu) = ii_dev;
+	}
+
+	return ii_dev;
+
+out_rollback:
+	for_each_cpu(cpu_rb, to_cpumask(ii_dev->cpumask)) {
+		if (cpu == cpu_rb)
+			break;
+		per_cpu(idle_inject_device, cpu_rb) = NULL;
+	}
+
+	kfree(ii_dev);
+
+	return NULL;
+}
+
+/**
+ * idle_inject_unregister - unregister idle injection control device
+ * @ii_dev: idle injection control device to unregister
+ *
+ * The function stops idle injection for the given control device,
+ * unregisters its kthreads and frees memory allocated when that device was
+ * created.
+ */
+void idle_inject_unregister(struct idle_inject_device *ii_dev)
+{
+	unsigned int cpu;
+
+	idle_inject_stop(ii_dev);
+
+	for_each_cpu(cpu, to_cpumask(ii_dev->cpumask))
+		per_cpu(idle_inject_device, cpu) = NULL;
+
+	kfree(ii_dev);
+}
+
+static struct smp_hotplug_thread idle_inject_threads = {
+	.store = &idle_inject_thread.tsk,
+	.setup = idle_inject_setup,
+	.thread_fn = idle_inject_fn,
+	.thread_comm = "idle_inject/%u",
+	.thread_should_run = idle_inject_should_run,
+};
+
+static int __init idle_inject_init(void)
+{
+	return smpboot_register_percpu_thread(&idle_inject_threads);
+}
+early_initcall(idle_inject_init);
diff --git a/include/linux/idle_inject.h b/include/linux/idle_inject.h
new file mode 100644
index 000000000000..bdc0293fb6cb
--- /dev/null
+++ b/include/linux/idle_inject.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Linaro Ltd
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ */
+#ifndef __IDLE_INJECT_H__
+#define __IDLE_INJECT_H__
+
+/* private idle injection device structure */
+struct idle_inject_device;
+
+struct idle_inject_device *idle_inject_register(struct cpumask *cpumask);
+
+void idle_inject_unregister(struct idle_inject_device *ii_dev);
+
+int idle_inject_start(struct idle_inject_device *ii_dev);
+
+void idle_inject_stop(struct idle_inject_device *ii_dev);
+
+void idle_inject_set_duration(struct idle_inject_device *ii_dev,
+				 unsigned int run_duration_ms,
+				 unsigned int idle_duration_ms);
+
+void idle_inject_get_duration(struct idle_inject_device *ii_dev,
+				 unsigned int *run_duration_ms,
+				 unsigned int *idle_duration_ms);
+#endif /* __IDLE_INJECT_H__ */

From 6f9db69ad93cd6ab77d5571cf748ff7cdcfb0285 Mon Sep 17 00:00:00 2001
From: Tristian Celestin <tristiancelestin@fastmail.com>
Date: Fri, 15 Jun 2018 04:50:18 -0400
Subject: [PATCH 06/35] ACPI / PM: Default to s2idle in all machines supporting
 LP S0

The Dell Venue Pro 7140 supports the Low Power S0 Idle state, but
does not support any of the _DSM functions that the current heuristic
checks for.

Since suspend-to-mem can not be safely performed on this machine,
and since the bitfield check can't cover this case, it is safer
to enable s2idle by default by checking for the presence of the
_DSM alone and removing the bitfield check.

Signed-off-by: Tristian Celestin <tristiancelestin@fastmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/sleep.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 5d0486f1cfcd..06ba8cc68889 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -718,9 +718,6 @@ static const struct acpi_device_id lps0_device_ids[] = {
 #define ACPI_LPS0_ENTRY		5
 #define ACPI_LPS0_EXIT		6
 
-#define ACPI_LPS0_SCREEN_MASK	((1 << ACPI_LPS0_SCREEN_OFF) | (1 << ACPI_LPS0_SCREEN_ON))
-#define ACPI_LPS0_PLATFORM_MASK	((1 << ACPI_LPS0_ENTRY) | (1 << ACPI_LPS0_EXIT))
-
 static acpi_handle lps0_device_handle;
 static guid_t lps0_dsm_guid;
 static char lps0_dsm_func_mask;
@@ -924,17 +921,14 @@ static int lps0_device_attach(struct acpi_device *adev,
 	if (out_obj && out_obj->type == ACPI_TYPE_BUFFER) {
 		char bitmask = *(char *)out_obj->buffer.pointer;
 
-		if ((bitmask & ACPI_LPS0_PLATFORM_MASK) == ACPI_LPS0_PLATFORM_MASK ||
-		    (bitmask & ACPI_LPS0_SCREEN_MASK) == ACPI_LPS0_SCREEN_MASK) {
-			lps0_dsm_func_mask = bitmask;
-			lps0_device_handle = adev->handle;
-			/*
-			 * Use suspend-to-idle by default if the default
-			 * suspend mode was not set from the command line.
-			 */
-			if (mem_sleep_default > PM_SUSPEND_MEM)
-				mem_sleep_current = PM_SUSPEND_TO_IDLE;
-		}
+		lps0_dsm_func_mask = bitmask;
+		lps0_device_handle = adev->handle;
+		/*
+		 * Use suspend-to-idle by default if the default
+		 * suspend mode was not set from the command line.
+		 */
+		if (mem_sleep_default > PM_SUSPEND_MEM)
+			mem_sleep_current = PM_SUSPEND_TO_IDLE;
 
 		acpi_handle_debug(adev->handle, "_DSM function mask: 0x%x\n",
 				  bitmask);

From 79a5c45199cd942047b1cf7bb50ff0699c2a3af0 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 29 Jun 2018 13:04:30 +0200
Subject: [PATCH 07/35] PM / Domains: dt: Add a power-domain-names property

Let's add a power-domain-names property, to allow consumer drivers to match
the power-domains specifiers via a list of power domain names. This follows
the same concept as for other similar DT bindings.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Rajendra Nayak <rnayak@codeaurora.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/devicetree/bindings/power/power_domain.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index 9b387f861aed..4d6bc8829468 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -114,18 +114,26 @@ Required properties:
  - power-domains : A list of PM domain specifiers, as defined by bindings of
 		the power controller that is the PM domain provider.
 
+Optional properties:
+ - power-domain-names : A list of power domain name strings sorted in the same
+		order as the power-domains property. Consumers drivers will use
+		power-domain-names to match power domains with power-domains
+		specifiers.
+
 Example:
 
 	leaky-device@12350000 {
 		compatible = "foo,i-leak-current";
 		reg = <0x12350000 0x1000>;
 		power-domains = <&power 0>;
+		power-domain-names = "io";
 	};
 
 	leaky-device@12351000 {
 		compatible = "foo,i-leak-current";
 		reg = <0x12351000 0x1000>;
 		power-domains = <&power 0>, <&power 1> ;
+		power-domain-names = "io", "clk";
 	};
 
 The first example above defines a typical PM domain consumer device, which is

From 5d6be70add65e3f236642ab0029e356261617cd0 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 29 Jun 2018 13:04:31 +0200
Subject: [PATCH 08/35] PM / Domains: Introduce option to attach a device by
 name to genpd

For the multiple PM domain case, let's introduce a new function called
genpd_dev_pm_attach_by_name(). This allows a device to be associated with
its PM domain through genpd, by using a name based lookup.

Note that, genpd_dev_pm_attach_by_name() shall only be called by the driver
core / PM core, similar to how the existing dev_pm_domain_attach_by_id()
makes use of genpd_dev_pm_attach_by_id(). However, this is implemented by
following changes on top.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Rajendra Nayak <rnayak@codeaurora.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 24 ++++++++++++++++++++++++
 include/linux/pm_domain.h   |  8 ++++++++
 2 files changed, 32 insertions(+)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 9e8484189034..79bdca70a81a 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2374,6 +2374,30 @@ struct device *genpd_dev_pm_attach_by_id(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(genpd_dev_pm_attach_by_id);
 
+/**
+ * genpd_dev_pm_attach_by_name - Associate a device with one of its PM domains.
+ * @dev: The device used to lookup the PM domain.
+ * @name: The name of the PM domain.
+ *
+ * Parse device's OF node to find a PM domain specifier using the
+ * power-domain-names DT property. For further description see
+ * genpd_dev_pm_attach_by_id().
+ */
+struct device *genpd_dev_pm_attach_by_name(struct device *dev, char *name)
+{
+	int index;
+
+	if (!dev->of_node)
+		return NULL;
+
+	index = of_property_match_string(dev->of_node, "power-domain-names",
+					 name);
+	if (index < 0)
+		return NULL;
+
+	return genpd_dev_pm_attach_by_id(dev, index);
+}
+
 static const struct of_device_id idle_state_match[] = {
 	{ .compatible = "domain-idle-state", },
 	{ }
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index cb8d84090cfb..03e14a38462d 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -239,6 +239,8 @@ unsigned int of_genpd_opp_to_performance_state(struct device *dev,
 int genpd_dev_pm_attach(struct device *dev);
 struct device *genpd_dev_pm_attach_by_id(struct device *dev,
 					 unsigned int index);
+struct device *genpd_dev_pm_attach_by_name(struct device *dev,
+					   char *name);
 #else /* !CONFIG_PM_GENERIC_DOMAINS_OF */
 static inline int of_genpd_add_provider_simple(struct device_node *np,
 					struct generic_pm_domain *genpd)
@@ -290,6 +292,12 @@ static inline struct device *genpd_dev_pm_attach_by_id(struct device *dev,
 	return NULL;
 }
 
+static inline struct device *genpd_dev_pm_attach_by_name(struct device *dev,
+							 char *name)
+{
+	return NULL;
+}
+
 static inline
 struct generic_pm_domain *of_genpd_remove_last(struct device_node *np)
 {

From 27dceb81f445c58b1d10d27d26eaf4ac2e9e0b00 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 29 Jun 2018 13:04:32 +0200
Subject: [PATCH 09/35] PM / Domains: Introduce dev_pm_domain_attach_by_name()

For the multiple PM domain case, let's introduce a new API called
dev_pm_domain_attach_by_name(). This allows a consumer driver to associate
its device with one of its PM domains, by using a name based lookup.

Do note that, currently it's only genpd that supports multiple PM domains
per device, but dev_pm_domain_attach_by_name() can easily by extended to
cover other PM domain types, if/when needed.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Rajendra Nayak <rnayak@codeaurora.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/common.c | 17 +++++++++++++++++
 include/linux/pm_domain.h   |  7 +++++++
 2 files changed, 24 insertions(+)

diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c
index df41b4780b3b..b413951c6abc 100644
--- a/drivers/base/power/common.c
+++ b/drivers/base/power/common.c
@@ -152,6 +152,23 @@ struct device *dev_pm_domain_attach_by_id(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_id);
 
+/**
+ * dev_pm_domain_attach_by_name - Associate a device with one of its PM domains.
+ * @dev: The device used to lookup the PM domain.
+ * @name: The name of the PM domain.
+ *
+ * For a detailed function description, see dev_pm_domain_attach_by_id().
+ */
+struct device *dev_pm_domain_attach_by_name(struct device *dev,
+					    char *name)
+{
+	if (dev->pm_domain)
+		return ERR_PTR(-EEXIST);
+
+	return genpd_dev_pm_attach_by_name(dev, name);
+}
+EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_name);
+
 /**
  * dev_pm_domain_detach - Detach a device from its PM domain.
  * @dev: Device to detach.
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 03e14a38462d..776c546d581a 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -309,6 +309,8 @@ struct generic_pm_domain *of_genpd_remove_last(struct device_node *np)
 int dev_pm_domain_attach(struct device *dev, bool power_on);
 struct device *dev_pm_domain_attach_by_id(struct device *dev,
 					  unsigned int index);
+struct device *dev_pm_domain_attach_by_name(struct device *dev,
+					    char *name);
 void dev_pm_domain_detach(struct device *dev, bool power_off);
 void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd);
 #else
@@ -321,6 +323,11 @@ static inline struct device *dev_pm_domain_attach_by_id(struct device *dev,
 {
 	return NULL;
 }
+static inline struct device *dev_pm_domain_attach_by_name(struct device *dev,
+							  char *name)
+{
+	return NULL;
+}
 static inline void dev_pm_domain_detach(struct device *dev, bool power_off) {}
 static inline void dev_pm_domain_set(struct device *dev,
 				     struct dev_pm_domain *pd) {}

From e88728f46cfbb59cc7e7acf1d230c05ec093764e Mon Sep 17 00:00:00 2001
From: Vivek Gautam <vivek.gautam@codeaurora.org>
Date: Wed, 27 Jun 2018 18:20:55 +0530
Subject: [PATCH 10/35] driver core: Rename flag AUTOREMOVE to
 AUTOREMOVE_CONSUMER

Now that we want to add another flag to autoremove the device link
on supplier unbind, it's fair to rename the existing flag from
DL_FLAG_AUTOREMOVE to DL_FLAG_AUTOREMOVE_CONSUMER so that we can
add similar flag for supplier later.
And, while we are touching device.h, fix a doc build warning.

Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/driver-api/device_link.rst |  8 ++++----
 drivers/base/core.c                      | 15 ++++++++-------
 drivers/gpu/drm/tegra/dc.c               |  2 +-
 drivers/gpu/ipu-v3/ipu-pre.c             |  3 ++-
 drivers/gpu/ipu-v3/ipu-prg.c             |  3 ++-
 drivers/soc/imx/gpc.c                    |  2 +-
 include/linux/device.h                   | 12 ++++++------
 7 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/Documentation/driver-api/device_link.rst b/Documentation/driver-api/device_link.rst
index 70e328e16aad..a005b904a264 100644
--- a/Documentation/driver-api/device_link.rst
+++ b/Documentation/driver-api/device_link.rst
@@ -81,10 +81,10 @@ integration is desired.
 Two other flags are specifically targeted at use cases where the device
 link is added from the consumer's ``->probe`` callback:  ``DL_FLAG_RPM_ACTIVE``
 can be specified to runtime resume the supplier upon addition of the
-device link.  ``DL_FLAG_AUTOREMOVE`` causes the device link to be automatically
-purged when the consumer fails to probe or later unbinds.  This obviates
-the need to explicitly delete the link in the ``->remove`` callback or in
-the error path of the ``->probe`` callback.
+device link.  ``DL_FLAG_AUTOREMOVE_CONSUMER`` causes the device link to be
+automatically purged when the consumer fails to probe or later unbinds.
+This obviates the need to explicitly delete the link in the ``->remove``
+callback or in the error path of the ``->probe`` callback.
 
 Limitations
 ===========
diff --git a/drivers/base/core.c b/drivers/base/core.c
index df3e1a44707a..14c1e3151e08 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -178,10 +178,10 @@ void device_pm_move_to_tail(struct device *dev)
  * of the link.  If DL_FLAG_PM_RUNTIME is not set, DL_FLAG_RPM_ACTIVE will be
  * ignored.
  *
- * If the DL_FLAG_AUTOREMOVE is set, the link will be removed automatically
- * when the consumer device driver unbinds from it.  The combination of both
- * DL_FLAG_AUTOREMOVE and DL_FLAG_STATELESS set is invalid and will cause NULL
- * to be returned.
+ * If the DL_FLAG_AUTOREMOVE_CONSUMER is set, the link will be removed
+ * automatically when the consumer device driver unbinds from it.
+ * The combination of both DL_FLAG_AUTOREMOVE_CONSUMER and DL_FLAG_STATELESS
+ * set is invalid and will cause NULL to be returned.
  *
  * A side effect of the link creation is re-ordering of dpm_list and the
  * devices_kset list by moving the consumer device and all devices depending
@@ -198,7 +198,8 @@ struct device_link *device_link_add(struct device *consumer,
 	struct device_link *link;
 
 	if (!consumer || !supplier ||
-	    ((flags & DL_FLAG_STATELESS) && (flags & DL_FLAG_AUTOREMOVE)))
+	    ((flags & DL_FLAG_STATELESS) &&
+	     (flags & DL_FLAG_AUTOREMOVE_CONSUMER)))
 		return NULL;
 
 	device_links_write_lock();
@@ -479,7 +480,7 @@ static void __device_links_no_driver(struct device *dev)
 		if (link->flags & DL_FLAG_STATELESS)
 			continue;
 
-		if (link->flags & DL_FLAG_AUTOREMOVE)
+		if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER)
 			kref_put(&link->kref, __device_link_del);
 		else if (link->status != DL_STATE_SUPPLIER_UNBIND)
 			WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
@@ -515,7 +516,7 @@ void device_links_driver_cleanup(struct device *dev)
 		if (link->flags & DL_FLAG_STATELESS)
 			continue;
 
-		WARN_ON(link->flags & DL_FLAG_AUTOREMOVE);
+		WARN_ON(link->flags & DL_FLAG_AUTOREMOVE_CONSUMER);
 		WARN_ON(link->status != DL_STATE_SUPPLIER_UNBIND);
 		WRITE_ONCE(link->status, DL_STATE_DORMANT);
 	}
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c
index c3afe7b2237e..965088afcfad 100644
--- a/drivers/gpu/drm/tegra/dc.c
+++ b/drivers/gpu/drm/tegra/dc.c
@@ -2312,7 +2312,7 @@ static int tegra_dc_couple(struct tegra_dc *dc)
 	 * POWER_CONTROL registers during CRTC enabling.
 	 */
 	if (dc->soc->coupled_pm && dc->pipe == 1) {
-		u32 flags = DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE;
+		u32 flags = DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_CONSUMER;
 		struct device_link *link;
 		struct device *partner;
 
diff --git a/drivers/gpu/ipu-v3/ipu-pre.c b/drivers/gpu/ipu-v3/ipu-pre.c
index 0f70e8847540..2f8db9d62551 100644
--- a/drivers/gpu/ipu-v3/ipu-pre.c
+++ b/drivers/gpu/ipu-v3/ipu-pre.c
@@ -128,7 +128,8 @@ ipu_pre_lookup_by_phandle(struct device *dev, const char *name, int index)
 	list_for_each_entry(pre, &ipu_pre_list, list) {
 		if (pre_node == pre->dev->of_node) {
 			mutex_unlock(&ipu_pre_list_mutex);
-			device_link_add(dev, pre->dev, DL_FLAG_AUTOREMOVE);
+			device_link_add(dev, pre->dev,
+					DL_FLAG_AUTOREMOVE_CONSUMER);
 			of_node_put(pre_node);
 			return pre;
 		}
diff --git a/drivers/gpu/ipu-v3/ipu-prg.c b/drivers/gpu/ipu-v3/ipu-prg.c
index 83f9dd934a5d..38a3a9764e49 100644
--- a/drivers/gpu/ipu-v3/ipu-prg.c
+++ b/drivers/gpu/ipu-v3/ipu-prg.c
@@ -100,7 +100,8 @@ ipu_prg_lookup_by_phandle(struct device *dev, const char *name, int ipu_id)
 	list_for_each_entry(prg, &ipu_prg_list, list) {
 		if (prg_node == prg->dev->of_node) {
 			mutex_unlock(&ipu_prg_list_mutex);
-			device_link_add(dev, prg->dev, DL_FLAG_AUTOREMOVE);
+			device_link_add(dev, prg->dev,
+					DL_FLAG_AUTOREMOVE_CONSUMER);
 			prg->id = ipu_id;
 			of_node_put(prg_node);
 			return prg;
diff --git a/drivers/soc/imx/gpc.c b/drivers/soc/imx/gpc.c
index 32f0748fd067..aa9e65bc965e 100644
--- a/drivers/soc/imx/gpc.c
+++ b/drivers/soc/imx/gpc.c
@@ -202,7 +202,7 @@ static int imx_pgc_power_domain_probe(struct platform_device *pdev)
 			goto genpd_err;
 	}
 
-	device_link_add(dev, dev->parent, DL_FLAG_AUTOREMOVE);
+	device_link_add(dev, dev->parent, DL_FLAG_AUTOREMOVE_CONSUMER);
 
 	return 0;
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 055a69dbcd18..3929805cdd59 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -90,7 +90,7 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *);
  * @num_vf:	Called to find out how many virtual functions a device on this
  *		bus supports.
  * @dma_configure:	Called to setup DMA configuration on a device on
-			this bus.
+ *			this bus.
  * @pm:		Power management operations of this bus, callback the specific
  *		device driver's pm-ops.
  * @iommu_ops:  IOMMU specific operations for this bus, used to attach IOMMU
@@ -784,14 +784,14 @@ enum device_link_state {
  * Device link flags.
  *
  * STATELESS: The core won't track the presence of supplier/consumer drivers.
- * AUTOREMOVE: Remove this link automatically on consumer driver unbind.
+ * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind.
  * PM_RUNTIME: If set, the runtime PM framework will use this link.
  * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation.
  */
-#define DL_FLAG_STATELESS	BIT(0)
-#define DL_FLAG_AUTOREMOVE	BIT(1)
-#define DL_FLAG_PM_RUNTIME	BIT(2)
-#define DL_FLAG_RPM_ACTIVE	BIT(3)
+#define DL_FLAG_STATELESS		BIT(0)
+#define DL_FLAG_AUTOREMOVE_CONSUMER	BIT(1)
+#define DL_FLAG_PM_RUNTIME		BIT(2)
+#define DL_FLAG_RPM_ACTIVE		BIT(3)
 
 /**
  * struct device_link - Device link representation.

From 1689cac5b32a6db6f812e8063ea418a7cf023d03 Mon Sep 17 00:00:00 2001
From: Vivek Gautam <vivek.gautam@codeaurora.org>
Date: Wed, 27 Jun 2018 18:20:56 +0530
Subject: [PATCH 11/35] driver core: Add flag to autoremove device link on
 supplier unbind

Add a flag to autoremove the device links on supplier driver
unbind. This obviates the need to explicitly delete the link
in the remove path.
We remove these links only when the supplier's link to its
consumers has gone to DL_STATE_SUPPLIER_UNBIND state.

Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
Suggested-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/driver-api/device_link.rst |  4 ++++
 drivers/base/core.c                      | 10 ++++++++++
 include/linux/device.h                   |  2 ++
 3 files changed, 16 insertions(+)

diff --git a/Documentation/driver-api/device_link.rst b/Documentation/driver-api/device_link.rst
index a005b904a264..d6763272e747 100644
--- a/Documentation/driver-api/device_link.rst
+++ b/Documentation/driver-api/device_link.rst
@@ -86,6 +86,10 @@ automatically purged when the consumer fails to probe or later unbinds.
 This obviates the need to explicitly delete the link in the ``->remove``
 callback or in the error path of the ``->probe`` callback.
 
+Similarly, when the device link is added from supplier's ``->probe`` callback,
+``DL_FLAG_AUTOREMOVE_SUPPLIER`` causes the device link to be automatically
+purged when the supplier fails to probe or later unbinds.
+
 Limitations
 ===========
 
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 14c1e3151e08..e721218bf352 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -518,6 +518,16 @@ void device_links_driver_cleanup(struct device *dev)
 
 		WARN_ON(link->flags & DL_FLAG_AUTOREMOVE_CONSUMER);
 		WARN_ON(link->status != DL_STATE_SUPPLIER_UNBIND);
+
+		/*
+		 * autoremove the links between this @dev and its consumer
+		 * devices that are not active, i.e. where the link state
+		 * has moved to DL_STATE_SUPPLIER_UNBIND.
+		 */
+		if (link->status == DL_STATE_SUPPLIER_UNBIND &&
+		    link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
+			kref_put(&link->kref, __device_link_del);
+
 		WRITE_ONCE(link->status, DL_STATE_DORMANT);
 	}
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 3929805cdd59..e80920452b49 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -787,11 +787,13 @@ enum device_link_state {
  * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind.
  * PM_RUNTIME: If set, the runtime PM framework will use this link.
  * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation.
+ * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind.
  */
 #define DL_FLAG_STATELESS		BIT(0)
 #define DL_FLAG_AUTOREMOVE_CONSUMER	BIT(1)
 #define DL_FLAG_PM_RUNTIME		BIT(2)
 #define DL_FLAG_RPM_ACTIVE		BIT(3)
+#define DL_FLAG_AUTOREMOVE_SUPPLIER	BIT(4)
 
 /**
  * struct device_link - Device link representation.

From 231f9415001138a000cd0f881c46654b7ea3f8c5 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Mon, 9 Jul 2018 14:03:55 +0200
Subject: [PATCH 12/35] ACPI / PM: save NVS memory for ASUS 1025C laptop

Every time I tried to upgrade my laptop from 3.10.x to 4.x I faced an
issue by which the fan would run at full speed upon resume. Bisecting
it showed me the issue was introduced in 3.17 by commit 821d6f0359b0
(ACPI / sleep: Do not save NVS for new machines to accelerate S3). This
code only affects machines built starting as of 2012, but this Asus
1025C laptop was made in 2012 and apparently needs the NVS data to be
saved, otherwise the CPU's thermal state is not properly reported on
resume and the fan runs at full speed upon resume.

Here's a very simple way to check if such a machine is affected :

  # cat /sys/class/thermal/thermal_zone0/temp
  55000

  ( now suspend, wait one second and resume )

  # cat /sys/class/thermal/thermal_zone0/temp
  0

  (and after ~15 seconds the fan starts to spin)

Let's apply the same quirk as commit cbc00c13 (ACPI: save NVS memory
for Lenovo G50-45) and reuse the function it provides. Note that this
commit was already backported to 4.9.x but not 4.4.x.

Cc: 3.17+ <stable@vger.kernel.org> # 3.17+: requires cbc00c13
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/sleep.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 06ba8cc68889..754d59f95500 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -338,6 +338,14 @@ static const struct dmi_system_id acpisleep_dmi_table[] __initconst = {
 		DMI_MATCH(DMI_PRODUCT_NAME, "K54HR"),
 		},
 	},
+	{
+	.callback = init_nvs_save_s3,
+	.ident = "Asus 1025C",
+	.matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "1025C"),
+		},
+	},
 	/*
 	 * https://bugzilla.kernel.org/show_bug.cgi?id=189431
 	 * Lenovo G50-45 is a platform later than 2012, but needs nvs memory

From 12ba2c65f9d3d6996d1e909921151487b009dd30 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 13 Feb 2018 22:10:42 +0100
Subject: [PATCH 13/35] PM / devfreq: exynos-ppmu: Delete an error message for
 a failed memory allocation in exynos_ppmu_probe()

Omit an extra message for a memory allocation failure in this function.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/event/exynos-ppmu.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c
index 3cd6a184fe7c..a9c64f0d3284 100644
--- a/drivers/devfreq/event/exynos-ppmu.c
+++ b/drivers/devfreq/event/exynos-ppmu.c
@@ -627,11 +627,9 @@ static int exynos_ppmu_probe(struct platform_device *pdev)
 
 	size = sizeof(struct devfreq_event_dev *) * info->num_events;
 	info->edev = devm_kzalloc(&pdev->dev, size, GFP_KERNEL);
-	if (!info->edev) {
-		dev_err(&pdev->dev,
-			"failed to allocate memory devfreq-event devices\n");
+	if (!info->edev)
 		return -ENOMEM;
-	}
+
 	edev = info->edev;
 	platform_set_drvdata(pdev, info);
 

From 2d803dc8f7a5f622ac47c3b650834ada3a2659b9 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Fri, 30 Mar 2018 17:14:03 +0530
Subject: [PATCH 14/35] PM / devfreq: use put_device() instead of kfree()

Never directly free @dev after calling device_register() or
device_unregister(), even if device_register() returned an error.
Always use put_device() to give up the reference initialized.

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/devfreq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 0b5b3abe054e..e26adf67e218 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -625,7 +625,8 @@ struct devfreq *devfreq_add_device(struct device *dev,
 	err = device_register(&devfreq->dev);
 	if (err) {
 		mutex_unlock(&devfreq->lock);
-		goto err_dev;
+		put_device(&devfreq->dev);
+		goto err_out;
 	}
 
 	devfreq->trans_table =
@@ -672,6 +673,7 @@ err_init:
 	mutex_unlock(&devfreq_list_lock);
 
 	device_unregister(&devfreq->dev);
+	devfreq = NULL;
 err_dev:
 	if (devfreq)
 		kfree(devfreq);

From 2d2894c94a61930615cd1411cd8e0bf5b804e3a1 Mon Sep 17 00:00:00 2001
From: Nick Milner <nick.milner@collabora.com>
Date: Wed, 9 May 2018 14:57:43 +0200
Subject: [PATCH 15/35] dt-bindings: devfreq: rk3399_dmc: improve binding
 documentation.

There are several typos, references to non existent files, grammar and
punctuation mistakes in the rk3399_dmc.txt binding. This patch tries
to improve the binding documentation and fix these mistakes.

Signed-off-by: Nick Milner <nick.milner@collabora.com>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Rob Herring <robh@kernel.org>
- [1/5] Fix some attributes to match with the code s/_disb/_dis/
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 .../bindings/devfreq/rk3399_dmc.txt           | 189 +++++++++---------
 1 file changed, 96 insertions(+), 93 deletions(-)

diff --git a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
index fc2bcbe26b1e..1b93b33c7d38 100644
--- a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
+++ b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
@@ -1,14 +1,14 @@
-* Rockchip rk3399 DMC(Dynamic Memory Controller) device
+* Rockchip rk3399 DMC (Dynamic Memory Controller) device
 
 Required properties:
 - compatible:		 Must be "rockchip,rk3399-dmc".
 - devfreq-events:	 Node to get DDR loading, Refer to
-			 Documentation/devicetree/bindings/devfreq/
+			 Documentation/devicetree/bindings/devfreq/event/
 			 rockchip-dfi.txt
-- interrupts:		 The interrupt number to the CPU. The interrupt
-			 specifier format depends on the interrupt controller.
-			 It should be DCF interrupts, when DDR dvfs finish,
-			 it will happen.
+- interrupts:		 The CPU interrupt number. The interrupt specifier
+			 format depends on the interrupt controller.
+			 It should be a DCF interrupt. When DDR DVFS finishes
+			 a DCF interrupt is triggered.
 - clocks:		 Phandles for clock specified in "clock-names" property
 - clock-names :		 The name of clock used by the DFI, must be
 			 "pclk_ddr_mon";
@@ -17,139 +17,142 @@ Required properties:
 - center-supply:	 DMC supply node.
 - status:		 Marks the node enabled/disabled.
 
-Following properties are ddr timing:
+Following properties relate to DDR timing:
 
 - rockchip,dram_speed_bin :	  Value reference include/dt-bindings/clock/ddr.h,
-				  it select ddr3 cl-trp-trcd type, default value
-				  "DDR3_DEFAULT".it must selected according to
-				  "Speed Bin" in ddr3 datasheet, DO NOT use
-				  smaller "Speed Bin" than ddr3 exactly is.
+				  it selects the DDR3 cl-trp-trcd type. It must be
+				  set according to "Speed Bin" in DDR3 datasheet,
+				  DO NOT use a smaller "Speed Bin" than specified
+				  for the DDR3 being used.
 
-- rockchip,pd_idle :		  Config the PD_IDLE value, defined the power-down
-				  idle period, memories are places into power-down
-				  mode if bus is idle for PD_IDLE DFI clocks.
+- rockchip,pd_idle :		  Configure the PD_IDLE value. Defines the
+				  power-down idle period in which memories are
+				  placed into power-down mode if bus is idle
+				  for PD_IDLE DFI clock cycles.
 
-- rockchip,sr_idle :		  Configure the SR_IDLE value, defined the
-				  selfrefresh idle period, memories are places
-				  into self-refresh mode if bus is idle for
-				  SR_IDLE*1024 DFI clocks (DFI clocks freq is
-				  half of dram's clocks), defaule value is "0".
+- rockchip,sr_idle :		  Configure the SR_IDLE value. Defines the
+				  self-refresh idle period in which memories are
+				  placed into self-refresh mode if bus is idle
+				  for SR_IDLE * 1024 DFI clock cycles (DFI
+				  clocks freq is half of DRAM clock), default
+				  value is "0".
 
-- rockchip,sr_mc_gate_idle :	  Defined the self-refresh with memory and
-				  controller clock gating idle period, memories
-				  are places into self-refresh mode and memory
-				  controller clock arg gating if bus is idle for
-				  sr_mc_gate_idle*1024 DFI clocks.
+- rockchip,sr_mc_gate_idle :	  Defines the memory self-refresh and controller
+				  clock gating idle period. Memories are placed
+				  into self-refresh mode and memory controller
+				  clock arg gating started if bus is idle for
+				  sr_mc_gate_idle*1024 DFI clock cycles.
 
-- rockchip,srpd_lite_idle :	  Defined the self-refresh power down idle
-				  period, memories are places into self-refresh
-				  power down mode if bus is idle for
-				  srpd_lite_idle*1024 DFI clocks. This parameter
-				  is for LPDDR4 only.
+- rockchip,srpd_lite_idle :	  Defines the self-refresh power down idle
+				  period in which memories are placed into
+				  self-refresh power down mode if bus is idle
+				  for srpd_lite_idle * 1024 DFI clock cycles.
+				  This parameter is for LPDDR4 only.
 
-- rockchip,standby_idle :	  Defined the standby idle period, memories are
-				  places into self-refresh than controller, pi,
-				  phy and dram clock will gating if bus is idle
-				  for standby_idle * DFI clocks.
+- rockchip,standby_idle :	  Defines the standby idle period in which
+				  memories are placed into self-refresh mode.
+				  The controller, pi, PHY and DRAM clock will
+				  be gated if bus is idle for standby_idle * DFI
+				  clock cycles.
 
-- rockchip,dram_dll_disb_freq :  It's defined the DDR3 dll bypass frequency in
-				  MHz, when ddr freq less than DRAM_DLL_DISB_FREQ,
-				  ddr3 dll will bypssed note: if dll was bypassed,
-				  the odt also stop working.
+- rockchip,dram_dll_dis_freq :	  Defines the DDR3 DLL bypass frequency in MHz.
+				  When DDR frequency is less than DRAM_DLL_DISB_FREQ,
+				  DDR3 DLL will be bypassed. Note: if DLL was bypassed,
+				  the odt will also stop working.
 
-- rockchip,phy_dll_disb_freq :	  Defined the PHY dll bypass frequency in
-				  MHz (Mega Hz), when ddr freq less than
-				  DRAM_DLL_DISB_FREQ, phy dll will bypssed.
-				  note: phy dll and phy odt are independent.
+- rockchip,phy_dll_dis_freq :	  Defines the PHY dll bypass frequency in
+				  MHz (Mega Hz). When DDR frequency is less than
+				  DRAM_DLL_DISB_FREQ, PHY DLL will be bypassed.
+				  Note: PHY DLL and PHY ODT are independent.
 
-- rockchip,ddr3_odt_disb_freq :  When dram type is DDR3, this parameter defined
-				  the odt disable frequency in MHz (Mega Hz),
-				  when ddr frequency less then ddr3_odt_disb_freq,
-				  the odt on dram side and controller side are
+- rockchip,ddr3_odt_dis_freq :	  When the DRAM type is DDR3, this parameter defines
+				  the ODT disable frequency in MHz (Mega Hz).
+				  when the DDR frequency is  less then ddr3_odt_dis_freq,
+				  the ODT on the DRAM side and controller side are
 				  both disabled.
 
-- rockchip,ddr3_drv :		  When dram type is DDR3, this parameter define
-				  the dram side driver stength in ohm, default
+- rockchip,ddr3_drv :		  When the DRAM type is DDR3, this parameter defines
+				  the DRAM side driver strength in ohms. Default
 				  value is DDR3_DS_40ohm.
 
-- rockchip,ddr3_odt :		  When dram type is DDR3, this parameter define
-				  the dram side ODT stength in ohm, default value
+- rockchip,ddr3_odt :		  When the DRAM type is DDR3, this parameter defines
+				  the DRAM side ODT strength in ohms. Default value
 				  is DDR3_ODT_120ohm.
 
-- rockchip,phy_ddr3_ca_drv :	  When dram type is DDR3, this parameter define
-				  the phy side CA line(incluing command line,
+- rockchip,phy_ddr3_ca_drv :	  When the DRAM type is DDR3, this parameter defines
+				  the phy side CA line (incluing command line,
 				  address line and clock line) driver strength.
 				  Default value is PHY_DRV_ODT_40.
 
-- rockchip,phy_ddr3_dq_drv :	  When dram type is DDR3, this parameter define
-				  the phy side DQ line(incluing DQS/DQ/DM line)
-				  driver strength. default value is PHY_DRV_ODT_40.
+- rockchip,phy_ddr3_dq_drv :	  When the DRAM type is DDR3, this parameter defines
+				  the PHY side DQ line (including DQS/DQ/DM line)
+				  driver strength. Default value is PHY_DRV_ODT_40.
 
-- rockchip,phy_ddr3_odt : 	  When dram type is DDR3, this parameter define the
-				  phy side odt strength, default value is
+- rockchip,phy_ddr3_odt : 	  When the DRAM type is DDR3, this parameter defines
+				  the PHY side ODT strength. Default value is
 				  PHY_DRV_ODT_240.
 
-- rockchip,lpddr3_odt_disb_freq : When dram type is LPDDR3, this parameter defined
-				  then odt disable frequency in MHz (Mega Hz),
-				  when ddr frequency less then ddr3_odt_disb_freq,
-				  the odt on dram side and controller side are
+- rockchip,lpddr3_odt_dis_freq : When the DRAM type is LPDDR3, this parameter defines
+				  then ODT disable frequency in MHz (Mega Hz).
+				  When DDR frequency is less then ddr3_odt_dis_freq,
+				  the ODT on the DRAM side and controller side are
 				  both disabled.
 
-- rockchip,lpddr3_drv :	  When dram type is LPDDR3, this parameter define
-				  the dram side driver stength in ohm, default
+- rockchip,lpddr3_drv :		  When the DRAM type is LPDDR3, this parameter defines
+				  the DRAM side driver strength in ohms. Default
 				  value is LP3_DS_34ohm.
 
-- rockchip,lpddr3_odt :	  When dram type is LPDDR3, this parameter define
-				  the dram side ODT stength in ohm, default value
+- rockchip,lpddr3_odt :		  When the DRAM type is LPDDR3, this parameter defines
+				  the DRAM side ODT strength in ohms. Default value
 				  is LP3_ODT_240ohm.
 
-- rockchip,phy_lpddr3_ca_drv :	  When dram type is LPDDR3, this parameter define
-				  the phy side CA line(incluing command line,
+- rockchip,phy_lpddr3_ca_drv :	  When the DRAM type is LPDDR3, this parameter defines
+				  the PHY side CA line (including command line,
 				  address line and clock line) driver strength.
-				  default value is PHY_DRV_ODT_40.
+				  Default value is PHY_DRV_ODT_40.
 
-- rockchip,phy_lpddr3_dq_drv :	  When dram type is LPDDR3, this parameter define
-				  the phy side DQ line(incluing DQS/DQ/DM line)
-				  driver strength. default value is
+- rockchip,phy_lpddr3_dq_drv :	  When the DRAM type is LPDDR3, this parameter defines
+				  the PHY side DQ line (including DQS/DQ/DM line)
+				  driver strength. Default value is
 				  PHY_DRV_ODT_40.
 
 - rockchip,phy_lpddr3_odt : 	  When dram type is LPDDR3, this parameter define
 				  the phy side odt strength, default value is
 				  PHY_DRV_ODT_240.
 
-- rockchip,lpddr4_odt_disb_freq : When dram type is LPDDR4, this parameter
-				  defined the odt disable frequency in
-				  MHz (Mega Hz), when ddr frequency less then
-				  ddr3_odt_disb_freq, the odt on dram side and
+- rockchip,lpddr4_odt_dis_freq : When the DRAM type is LPDDR4, this parameter
+				  defines the ODT disable frequency in
+				  MHz (Mega Hz). When the DDR frequency is less then
+				  ddr3_odt_dis_freq, the ODT on the DRAM side and
 				  controller side are both disabled.
 
-- rockchip,lpddr4_drv :	  When dram type is LPDDR4, this parameter define
-				  the dram side driver stength in ohm, default
+- rockchip,lpddr4_drv :		  When the DRAM type is LPDDR4, this parameter defines
+				  the DRAM side driver strength in ohms. Default
 				  value is LP4_PDDS_60ohm.
 
-- rockchip,lpddr4_dq_odt : 	  When dram type is LPDDR4, this parameter define
-				  the dram side ODT on dqs/dq line stength in ohm,
-				  default value is LP4_DQ_ODT_40ohm.
+- rockchip,lpddr4_dq_odt : 	  When the DRAM type is LPDDR4, this parameter defines
+				  the DRAM side ODT on DQS/DQ line strength in ohms.
+				  Default value is LP4_DQ_ODT_40ohm.
 
-- rockchip,lpddr4_ca_odt :	  When dram type is LPDDR4, this parameter define
-				  the dram side ODT on ca line stength in ohm,
-				  default value is LP4_CA_ODT_40ohm.
+- rockchip,lpddr4_ca_odt :	  When the DRAM type is LPDDR4, this parameter defines
+				  the DRAM side ODT on CA line strength in ohms.
+				  Default value is LP4_CA_ODT_40ohm.
 
-- rockchip,phy_lpddr4_ca_drv :	  When dram type is LPDDR4, this parameter define
-				  the phy side  CA line(incluing command address
-				  line) driver strength. default value is
+- rockchip,phy_lpddr4_ca_drv :	  When the DRAM type is LPDDR4, this parameter defines
+				  the PHY side CA line (including command address
+				  line) driver strength. Default value is
 				  PHY_DRV_ODT_40.
 
-- rockchip,phy_lpddr4_ck_cs_drv : When dram type is LPDDR4, this parameter define
-				  the phy side clock line and cs line driver
-				  strength. default value is PHY_DRV_ODT_80.
+- rockchip,phy_lpddr4_ck_cs_drv : When the DRAM type is LPDDR4, this parameter defines
+				  the PHY side clock line and CS line driver
+				  strength. Default value is PHY_DRV_ODT_80.
 
-- rockchip,phy_lpddr4_dq_drv :	  When dram type is LPDDR4, this parameter define
-				  the phy side DQ line(incluing DQS/DQ/DM line)
-				  driver strength. default value is PHY_DRV_ODT_80.
+- rockchip,phy_lpddr4_dq_drv :	  When the DRAM type is LPDDR4, this parameter defines
+				  the PHY side DQ line (including DQS/DQ/DM line)
+				  driver strength. Default value is PHY_DRV_ODT_80.
 
-- rockchip,phy_lpddr4_odt :	  When dram type is LPDDR4, this parameter define
-				  the phy side odt strength, default value is
+- rockchip,phy_lpddr4_odt :	  When the DRAM type is LPDDR4, this parameter defines
+				  the PHY side ODT strength. Default value is
 				  PHY_DRV_ODT_60.
 
 Example:

From 49502b23e0e45b57b776490cfdfd95ea7fecd7c7 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Wed, 9 May 2018 14:57:44 +0200
Subject: [PATCH 16/35] dt-bindings: clock: add rk3399 DDR3 standard speed
 bins.

DDR3 SDRAM Standard (JESD79-3F) defines some standard speed bins for
DDR3 memories. The rk3399_dmc driver allows you to pass these values via
the device tree. For that purpose the devfreq/rk3399_dmc.txt binding
refers to a ddr.h file which does not exist. This patch adds the missing
defines in a include file called rk3399-ddr.h with the definition of
standard speed bins according to the ARM Trusted Firmware (ATF).

Fixes: c1ceb8f7c167 (Documentation: bindings: add dt documentation for rk3399 dmc)
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 .../bindings/devfreq/rk3399_dmc.txt           |  2 +-
 include/dt-bindings/clock/rk3399-ddr.h        | 56 +++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 include/dt-bindings/clock/rk3399-ddr.h

diff --git a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
index 1b93b33c7d38..e0030a139383 100644
--- a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
+++ b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
@@ -19,7 +19,7 @@ Required properties:
 
 Following properties relate to DDR timing:
 
-- rockchip,dram_speed_bin :	  Value reference include/dt-bindings/clock/ddr.h,
+- rockchip,dram_speed_bin :	  Value reference include/dt-bindings/clock/rk3399-ddr.h,
 				  it selects the DDR3 cl-trp-trcd type. It must be
 				  set according to "Speed Bin" in DDR3 datasheet,
 				  DO NOT use a smaller "Speed Bin" than specified
diff --git a/include/dt-bindings/clock/rk3399-ddr.h b/include/dt-bindings/clock/rk3399-ddr.h
new file mode 100644
index 000000000000..ed2280844963
--- /dev/null
+++ b/include/dt-bindings/clock/rk3399-ddr.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR MIT) */
+
+#ifndef DT_BINDINGS_DDR_H
+#define DT_BINDINGS_DDR_H
+
+/*
+ * DDR3 SDRAM Standard Speed Bins include tCK, tRCD, tRP, tRAS and tRC for
+ * each corresponding bin.
+ */
+
+/* DDR3-800 (5-5-5) */
+#define DDR3_800D	0
+/* DDR3-800 (6-6-6) */
+#define DDR3_800E	1
+/* DDR3-1066 (6-6-6) */
+#define DDR3_1066E	2
+/* DDR3-1066 (7-7-7) */
+#define DDR3_1066F	3
+/* DDR3-1066 (8-8-8) */
+#define DDR3_1066G	4
+/* DDR3-1333 (7-7-7) */
+#define DDR3_1333F	5
+/* DDR3-1333 (8-8-8) */
+#define DDR3_1333G	6
+/* DDR3-1333 (9-9-9) */
+#define DDR3_1333H	7
+/* DDR3-1333 (10-10-10) */
+#define DDR3_1333J 	8
+/* DDR3-1600 (8-8-8) */
+#define DDR3_1600G	9
+/* DDR3-1600 (9-9-9) */
+#define DDR3_1600H	10
+/* DDR3-1600 (10-10-10) */
+#define DDR3_1600J	11
+/* DDR3-1600 (11-11-11) */
+#define DDR3_1600K	12
+/* DDR3-1600 (10-10-10) */
+#define DDR3_1866J	13
+/* DDR3-1866 (11-11-11) */
+#define DDR3_1866K	14
+/* DDR3-1866 (12-12-12) */
+#define DDR3_1866L	15
+/* DDR3-1866 (13-13-13) */
+#define DDR3_1866M	16
+/* DDR3-2133 (11-11-11) */
+#define DDR3_2133K	17
+/* DDR3-2133 (12-12-12) */
+#define DDR3_2133L	18
+/* DDR3-2133 (13-13-13) */
+#define DDR3_2133M	19
+/* DDR3-2133 (14-14-14) */
+#define DDR3_2133N	20
+/* DDR3 ATF default */
+#define DDR3_DEFAULT	21
+
+#endif

From 90dd72e1290dd86c4b6e5c421fcd13e60e625782 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Wed, 9 May 2018 14:57:45 +0200
Subject: [PATCH 17/35] PM / devfreq: rk3399_dmc: remove wait for dcf irq
 event.

We have already wait dcf done in ATF, so don't need wait dcf irq
in kernel, besides, clear dcf irq in kernel will import competiton
between kernel and ATF, only handle dcf irq in ATF is a better way.

Signed-off-by: Lin Huang <hl@rock-chips.com>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/rk3399_dmc.c | 53 +-----------------------------------
 1 file changed, 1 insertion(+), 52 deletions(-)

diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index 5dfbfa3cc878..44a379657cd5 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -68,15 +68,6 @@ struct rk3399_dmcfreq {
 	struct devfreq_event_dev *edev;
 	struct mutex lock;
 	struct dram_timing timing;
-
-	/*
-	 * DDR Converser of Frequency (DCF) is used to implement DDR frequency
-	 * conversion without the participation of CPU, we will implement and
-	 * control it in arm trust firmware.
-	 */
-	wait_queue_head_t	wait_dcf_queue;
-	int irq;
-	int wait_dcf_flag;
 	struct regulator *vdd_center;
 	unsigned long rate, target_rate;
 	unsigned long volt, target_volt;
@@ -117,7 +108,6 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
 			goto out;
 		}
 	}
-	dmcfreq->wait_dcf_flag = 1;
 
 	err = clk_set_rate(dmcfreq->dmc_clk, target_rate);
 	if (err) {
@@ -128,14 +118,6 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
 		goto out;
 	}
 
-	/*
-	 * Wait until bcf irq happen, it means freq scaling finish in
-	 * arm trust firmware, use 100ms as timeout time.
-	 */
-	if (!wait_event_timeout(dmcfreq->wait_dcf_queue,
-				!dmcfreq->wait_dcf_flag, HZ / 10))
-		dev_warn(dev, "Timeout waiting for dcf interrupt\n");
-
 	/*
 	 * Check the dpll rate,
 	 * There only two result we will get,
@@ -241,22 +223,6 @@ static __maybe_unused int rk3399_dmcfreq_resume(struct device *dev)
 static SIMPLE_DEV_PM_OPS(rk3399_dmcfreq_pm, rk3399_dmcfreq_suspend,
 			 rk3399_dmcfreq_resume);
 
-static irqreturn_t rk3399_dmc_irq(int irq, void *dev_id)
-{
-	struct rk3399_dmcfreq *dmcfreq = dev_id;
-	struct arm_smccc_res res;
-
-	dmcfreq->wait_dcf_flag = 0;
-	wake_up(&dmcfreq->wait_dcf_queue);
-
-	/* Clear the DCF interrupt */
-	arm_smccc_smc(ROCKCHIP_SIP_DRAM_FREQ, 0, 0,
-		      ROCKCHIP_SIP_CONFIG_DRAM_CLR_IRQ,
-		      0, 0, 0, 0, &res);
-
-	return IRQ_HANDLED;
-}
-
 static int of_get_ddr_timings(struct dram_timing *timing,
 			      struct device_node *np)
 {
@@ -330,16 +296,10 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct device_node *np = pdev->dev.of_node;
 	struct rk3399_dmcfreq *data;
-	int ret, irq, index, size;
+	int ret, index, size;
 	uint32_t *timing;
 	struct dev_pm_opp *opp;
 
-	irq = platform_get_irq(pdev, 0);
-	if (irq < 0) {
-		dev_err(&pdev->dev,
-			"Cannot get the dmc interrupt resource: %d\n", irq);
-		return irq;
-	}
 	data = devm_kzalloc(dev, sizeof(struct rk3399_dmcfreq), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
@@ -358,17 +318,6 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
 		return PTR_ERR(data->dmc_clk);
 	};
 
-	data->irq = irq;
-	ret = devm_request_irq(dev, irq, rk3399_dmc_irq, 0,
-			       dev_name(dev), data);
-	if (ret) {
-		dev_err(dev, "Failed to request dmc irq: %d\n", ret);
-		return ret;
-	}
-
-	init_waitqueue_head(&data->wait_dcf_queue);
-	data->wait_dcf_flag = 0;
-
 	data->edev = devfreq_event_get_edev_by_phandle(dev, 0);
 	if (IS_ERR(data->edev))
 		return -EPROBE_DEFER;

From 77f2c2305decda26885f8b2307577267b78c468f Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Wed, 9 May 2018 14:57:46 +0200
Subject: [PATCH 18/35] dt-bindings: devfreq: rk3399_dmc: move interrupts to be
 optional.

In ATF we already wait for DDR DVFS finish, so move the interrupts properties
to be optional.

Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 .../devicetree/bindings/devfreq/rk3399_dmc.txt         | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
index e0030a139383..0ec68141f85a 100644
--- a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
+++ b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
@@ -5,10 +5,6 @@ Required properties:
 - devfreq-events:	 Node to get DDR loading, Refer to
 			 Documentation/devicetree/bindings/devfreq/event/
 			 rockchip-dfi.txt
-- interrupts:		 The CPU interrupt number. The interrupt specifier
-			 format depends on the interrupt controller.
-			 It should be a DCF interrupt. When DDR DVFS finishes
-			 a DCF interrupt is triggered.
 - clocks:		 Phandles for clock specified in "clock-names" property
 - clock-names :		 The name of clock used by the DFI, must be
 			 "pclk_ddr_mon";
@@ -17,6 +13,12 @@ Required properties:
 - center-supply:	 DMC supply node.
 - status:		 Marks the node enabled/disabled.
 
+Optional properties:
+- interrupts:		 The CPU interrupt number. The interrupt specifier
+			 format depends on the interrupt controller.
+			 It should be a DCF interrupt. When DDR DVFS finishes
+			 a DCF interrupt is triggered.
+
 Following properties relate to DDR timing:
 
 - rockchip,dram_speed_bin :	  Value reference include/dt-bindings/clock/rk3399-ddr.h,

From 49edc52312c34c981722833b0d9344c2aa83892d Mon Sep 17 00:00:00 2001
From: Lin Huang <hl@rock-chips.com>
Date: Wed, 9 May 2018 14:57:47 +0200
Subject: [PATCH 19/35] PM / devfreq: rk3399_dmc: do not print error when get
 supply and clk defer.

We just return -EPROBE_DEFER error code to caller and do not
print error message when try to get center logic regulator
and DMC clock defer.

Signed-off-by: Lin Huang <hl@rock-chips.com>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/rk3399_dmc.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index 44a379657cd5..5bfca028eaaf 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -308,12 +308,18 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
 
 	data->vdd_center = devm_regulator_get(dev, "center");
 	if (IS_ERR(data->vdd_center)) {
+		if (PTR_ERR(data->vdd_center) == -EPROBE_DEFER)
+			return -EPROBE_DEFER;
+
 		dev_err(dev, "Cannot get the regulator \"center\"\n");
 		return PTR_ERR(data->vdd_center);
 	}
 
 	data->dmc_clk = devm_clk_get(dev, "dmc_clk");
 	if (IS_ERR(data->dmc_clk)) {
+		if (PTR_ERR(data->dmc_clk) == -EPROBE_DEFER)
+			return -EPROBE_DEFER;
+
 		dev_err(dev, "Cannot get the clk dmc_clk\n");
 		return PTR_ERR(data->dmc_clk);
 	};

From dfa7d764caf00b12da276ea473d7f1fd7fd40200 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Wed, 9 May 2018 14:57:48 +0200
Subject: [PATCH 20/35] PM / devfreq: rk3399_dmc: fix spelling mistakes.

Fix some spelling mistakes in error and debug messages.

Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/rk3399_dmc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index 5bfca028eaaf..d5c03e5abe13 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -103,7 +103,7 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
 		err = regulator_set_voltage(dmcfreq->vdd_center, target_volt,
 					    target_volt);
 		if (err) {
-			dev_err(dev, "Cannot to set voltage %lu uV\n",
+			dev_err(dev, "Cannot set voltage %lu uV\n",
 				target_volt);
 			goto out;
 		}
@@ -111,8 +111,8 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
 
 	err = clk_set_rate(dmcfreq->dmc_clk, target_rate);
 	if (err) {
-		dev_err(dev, "Cannot to set frequency %lu (%d)\n",
-			target_rate, err);
+		dev_err(dev, "Cannot set frequency %lu (%d)\n", target_rate,
+			err);
 		regulator_set_voltage(dmcfreq->vdd_center, dmcfreq->volt,
 				      dmcfreq->volt);
 		goto out;
@@ -128,8 +128,8 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
 
 	/* If get the incorrect rate, set voltage to old value. */
 	if (dmcfreq->rate != target_rate) {
-		dev_err(dev, "Get wrong ddr frequency, Request frequency %lu,\
-			Current frequency %lu\n", target_rate, dmcfreq->rate);
+		dev_err(dev, "Got wrong frequency, Request %lu, Current %lu\n",
+			target_rate, dmcfreq->rate);
 		regulator_set_voltage(dmcfreq->vdd_center, dmcfreq->volt,
 				      dmcfreq->volt);
 		goto out;
@@ -137,7 +137,7 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
 		err = regulator_set_voltage(dmcfreq->vdd_center, target_volt,
 					    target_volt);
 	if (err)
-		dev_err(dev, "Cannot to set vol %lu uV\n", target_volt);
+		dev_err(dev, "Cannot set voltage %lu uV\n", target_volt);
 
 	dmcfreq->rate = target_rate;
 	dmcfreq->volt = target_volt;

From 2c2cb1e6b05b90d55b4b943646faa3cfbdf78f6e Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Fri, 25 May 2018 13:30:33 -0700
Subject: [PATCH 21/35] PM / devfreq: Init user limits from OPP limits, not
 viceversa

Commit ab8f58ad72c4 ("PM / devfreq: Set min/max_freq when adding
the devfreq device") introduced the initialization of the user
limits min/max_freq from the lowest/highest available OPPs. Later
commit f1d981eaecf8 ("PM / devfreq: Use the available min/max
frequency") added scaling_min/max_freq, which actually represent
the frequencies of the lowest/highest available OPP. scaling_min/
max_freq are initialized with the values from min/max_freq, which
is totally correct in the context, but a bit awkward to read.

Swap the initialization and assign scaling_min/max_freq with the
OPP freqs and then the user limts min/max_freq with scaling_min/
max_freq.

Needless to say that this change is a NOP, intended to improve
readability.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Reviewed-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/devfreq.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index e26adf67e218..4c49bb1330b5 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -604,21 +604,21 @@ struct devfreq *devfreq_add_device(struct device *dev,
 		mutex_lock(&devfreq->lock);
 	}
 
-	devfreq->min_freq = find_available_min_freq(devfreq);
-	if (!devfreq->min_freq) {
+	devfreq->scaling_min_freq = find_available_min_freq(devfreq);
+	if (!devfreq->scaling_min_freq) {
 		mutex_unlock(&devfreq->lock);
 		err = -EINVAL;
 		goto err_dev;
 	}
-	devfreq->scaling_min_freq = devfreq->min_freq;
+	devfreq->min_freq = devfreq->scaling_min_freq;
 
-	devfreq->max_freq = find_available_max_freq(devfreq);
-	if (!devfreq->max_freq) {
+	devfreq->scaling_max_freq = find_available_max_freq(devfreq);
+	if (!devfreq->scaling_max_freq) {
 		mutex_unlock(&devfreq->lock);
 		err = -EINVAL;
 		goto err_dev;
 	}
-	devfreq->scaling_max_freq = devfreq->max_freq;
+	devfreq->max_freq = devfreq->scaling_max_freq;
 
 	dev_set_name(&devfreq->dev, "devfreq%d",
 				atomic_inc_return(&devfreq_no));

From d6e98f3e6d825380b566dc59359917a116090154 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Fri, 15 Jun 2018 17:12:17 +0200
Subject: [PATCH 22/35] PM / devfreq: rk3399_dmc: Fix duplicated opp table on
 reload.

The opp table is not removed when the driver is unloaded neither when
there is an error within probe, so if the driver is reloaded the opp
core shows the following warning:

  rk3399-dmc-freq dmc: _opp_add: duplicate OPPs detected. Existing: freq:
               200000000, volt: 900000, enabled: 1. New: freq: 200000000,
               volt: 900000, enabled: 1
  rk3399-dmc-freq dmc: _opp_add: duplicate OPPs detected. Existing: freq:
               400000000, volt: 900000, enabled: 1. New: freq: 400000000,
               volt: 900000, enabled: 1
  rk3399-dmc-freq dmc: _opp_add: duplicate OPPs detected. Existing: freq:
               666000000, volt: 900000, enabled: 1. New: freq: 666000000,
               volt: 900000, enabled: 1
  rk3399-dmc-freq dmc: _opp_add: duplicate OPPs detected. Existing: freq:
               800000000, volt: 900000, enabled: 1. New: freq: 800000000,
               volt: 900000, enabled: 1
  rk3399-dmc-freq dmc: _opp_add: duplicate OPPs detected. Existing: freq:
               928000000, volt: 900000, enabled: 1. New: freq: 928000000,
               volt: 900000, enabled: 1

This patch fixes the error path in the probe function and adds a .remove
function to properly cleanup the opp table on unloading.

Fixes: 5a893e31a636c (PM / devfreq: rockchip: add devfreq driver for rk3399 dmc)
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/rk3399_dmc.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index d5c03e5abe13..e795ad2b3f6b 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -375,8 +375,10 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
 	data->rate = clk_get_rate(data->dmc_clk);
 
 	opp = devfreq_recommended_opp(dev, &data->rate, 0);
-	if (IS_ERR(opp))
-		return PTR_ERR(opp);
+	if (IS_ERR(opp)) {
+		ret = PTR_ERR(opp);
+		goto err_free_opp;
+	}
 
 	data->rate = dev_pm_opp_get_freq(opp);
 	data->volt = dev_pm_opp_get_voltage(opp);
@@ -388,13 +390,33 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
 					   &rk3399_devfreq_dmc_profile,
 					   DEVFREQ_GOV_SIMPLE_ONDEMAND,
 					   &data->ondemand_data);
-	if (IS_ERR(data->devfreq))
-		return PTR_ERR(data->devfreq);
+	if (IS_ERR(data->devfreq)) {
+		ret = PTR_ERR(data->devfreq);
+		goto err_free_opp;
+	}
+
 	devm_devfreq_register_opp_notifier(dev, data->devfreq);
 
 	data->dev = dev;
 	platform_set_drvdata(pdev, data);
 
+	return 0;
+
+err_free_opp:
+	dev_pm_opp_of_remove_table(&pdev->dev);
+	return ret;
+}
+
+static int rk3399_dmcfreq_remove(struct platform_device *pdev)
+{
+	struct rk3399_dmcfreq *dmcfreq = dev_get_drvdata(&pdev->dev);
+
+	/*
+	 * Before remove the opp table we need to unregister the opp notifier.
+	 */
+	devm_devfreq_unregister_opp_notifier(dmcfreq->dev, dmcfreq->devfreq);
+	dev_pm_opp_of_remove_table(dmcfreq->dev);
+
 	return 0;
 }
 
@@ -406,6 +428,7 @@ MODULE_DEVICE_TABLE(of, rk3399dmc_devfreq_of_match);
 
 static struct platform_driver rk3399_dmcfreq_driver = {
 	.probe	= rk3399_dmcfreq_probe,
+	.remove = rk3399_dmcfreq_remove,
 	.driver = {
 		.name	= "rk3399-dmc-freq",
 		.pm	= &rk3399_dmcfreq_pm,

From 2851b907091f2a04d19f50a5fc6a3aeed9e88dd3 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@bootlin.com>
Date: Wed, 4 Jul 2018 15:25:24 +0200
Subject: [PATCH 23/35] dt-bindings: marvell: Add documentation for the Armada
 3700 AVS binding

Extend the documentation of the Armada 37xx SoC with the Adaptive Voltage
Scaling (AVS) registers.

Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../bindings/arm/marvell/armada-37xx.txt          | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt
index 35c3c3460d17..eddde4faef01 100644
--- a/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt
+++ b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt
@@ -33,3 +33,18 @@ nb_pm: syscon@14000 {
 	compatible = "marvell,armada-3700-nb-pm", "syscon";
 	reg = <0x14000 0x60>;
 }
+
+AVS
+---
+
+For AVS an other component is needed:
+
+Required properties:
+- compatible     : should contain "marvell,armada-3700-avs", "syscon";
+- reg            : the register start and length for the AVS
+
+Example:
+avs: avs@11500 {
+	compatible = "marvell,armada-3700-avs", "syscon";
+	reg = <0x11500 0x40>;
+}

From 1c3528232f4ba608cc2c31c7a8a55e0dbd6cb200 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@bootlin.com>
Date: Tue, 19 Jun 2018 14:44:01 +0200
Subject: [PATCH 24/35] cpufreq: armada-37xx: Add AVS support

Armada 37xx supports Adaptive Voltage Scaling and thanks to this patch a
voltage is associated to each load level.

Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/armada-37xx-cpufreq.c | 163 +++++++++++++++++++++++++-
 1 file changed, 160 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c
index 739da90ff3f6..75491fc841a6 100644
--- a/drivers/cpufreq/armada-37xx-cpufreq.c
+++ b/drivers/cpufreq/armada-37xx-cpufreq.c
@@ -51,6 +51,16 @@
 #define  ARMADA_37XX_DVFS_LOAD_2	2
 #define  ARMADA_37XX_DVFS_LOAD_3	3
 
+/* AVS register set */
+#define ARMADA_37XX_AVS_CTL0		0x0
+#define	 ARMADA_37XX_AVS_ENABLE		BIT(30)
+#define	 ARMADA_37XX_AVS_HIGH_VDD_LIMIT	16
+#define	 ARMADA_37XX_AVS_LOW_VDD_LIMIT	22
+#define	 ARMADA_37XX_AVS_VDD_MASK	0x3F
+#define ARMADA_37XX_AVS_CTL2		0x8
+#define	 ARMADA_37XX_AVS_LOW_VDD_EN	BIT(6)
+#define ARMADA_37XX_AVS_VSET(x)	    (0x1C + 4 * (x))
+
 /*
  * On Armada 37xx the Power management manages 4 level of CPU load,
  * each level can be associated with a CPU clock source, a CPU
@@ -58,6 +68,17 @@
  */
 #define LOAD_LEVEL_NR	4
 
+#define MIN_VOLT_MV 1000
+
+/*  AVS value for the corresponding voltage (in mV) */
+static int avs_map[] = {
+	747, 758, 770, 782, 793, 805, 817, 828, 840, 852, 863, 875, 887, 898,
+	910, 922, 933, 945, 957, 968, 980, 992, 1003, 1015, 1027, 1038, 1050,
+	1062, 1073, 1085, 1097, 1108, 1120, 1132, 1143, 1155, 1167, 1178, 1190,
+	1202, 1213, 1225, 1237, 1248, 1260, 1272, 1283, 1295, 1307, 1318, 1330,
+	1342
+};
+
 struct armada37xx_cpufreq_state {
 	struct regmap *regmap;
 	u32 nb_l0l1;
@@ -71,6 +92,7 @@ static struct armada37xx_cpufreq_state *armada37xx_cpufreq_state;
 struct armada_37xx_dvfs {
 	u32 cpu_freq_max;
 	u8 divider[LOAD_LEVEL_NR];
+	u32 avs[LOAD_LEVEL_NR];
 };
 
 static struct armada_37xx_dvfs armada_37xx_dvfs[] = {
@@ -148,6 +170,128 @@ static void __init armada37xx_cpufreq_dvfs_setup(struct regmap *base,
 	clk_set_parent(clk, parent);
 }
 
+/*
+ * Find out the armada 37x supported AVS value whose voltage value is
+ * the round-up closest to the target voltage value.
+ */
+static u32 armada_37xx_avs_val_match(int target_vm)
+{
+	u32 avs;
+
+	/* Find out the round-up closest supported voltage value */
+	for (avs = 0; avs < ARRAY_SIZE(avs_map); avs++)
+		if (avs_map[avs] >= target_vm)
+			break;
+
+	/*
+	 * If all supported voltages are smaller than target one,
+	 * choose the largest supported voltage
+	 */
+	if (avs == ARRAY_SIZE(avs_map))
+		avs = ARRAY_SIZE(avs_map) - 1;
+
+	return avs;
+}
+
+/*
+ * For Armada 37xx soc, L0(VSET0) VDD AVS value is set to SVC revision
+ * value or a default value when SVC is not supported.
+ * - L0 can be read out from the register of AVS_CTRL_0 and L0 voltage
+ *   can be got from the mapping table of avs_map.
+ * - L1 voltage should be about 100mv smaller than L0 voltage
+ * - L2 & L3 voltage should be about 150mv smaller than L0 voltage.
+ * This function calculates L1 & L2 & L3 AVS values dynamically based
+ * on L0 voltage and fill all AVS values to the AVS value table.
+ */
+static void __init armada37xx_cpufreq_avs_configure(struct regmap *base,
+						struct armada_37xx_dvfs *dvfs)
+{
+	unsigned int target_vm;
+	int load_level = 0;
+	u32 l0_vdd_min;
+
+	if (base == NULL)
+		return;
+
+	/* Get L0 VDD min value */
+	regmap_read(base, ARMADA_37XX_AVS_CTL0, &l0_vdd_min);
+	l0_vdd_min = (l0_vdd_min >> ARMADA_37XX_AVS_LOW_VDD_LIMIT) &
+		ARMADA_37XX_AVS_VDD_MASK;
+	if (l0_vdd_min >= ARRAY_SIZE(avs_map))  {
+		pr_err("L0 VDD MIN %d is not correct.\n", l0_vdd_min);
+		return;
+	}
+	dvfs->avs[0] = l0_vdd_min;
+
+	if (avs_map[l0_vdd_min] <= MIN_VOLT_MV) {
+		/*
+		 * If L0 voltage is smaller than 1000mv, then all VDD sets
+		 * use L0 voltage;
+		 */
+		u32 avs_min = armada_37xx_avs_val_match(MIN_VOLT_MV);
+
+		for (load_level = 1; load_level < LOAD_LEVEL_NR; load_level++)
+			dvfs->avs[load_level] = avs_min;
+
+		return;
+	}
+
+	/*
+	 * L1 voltage is equal to L0 voltage - 100mv and it must be
+	 * larger than 1000mv
+	 */
+
+	target_vm = avs_map[l0_vdd_min] - 100;
+	target_vm = target_vm > MIN_VOLT_MV ? target_vm : MIN_VOLT_MV;
+	dvfs->avs[1] = armada_37xx_avs_val_match(target_vm);
+
+	/*
+	 * L2 & L3 voltage is equal to L0 voltage - 150mv and it must
+	 * be larger than 1000mv
+	 */
+	target_vm = avs_map[l0_vdd_min] - 150;
+	target_vm = target_vm > MIN_VOLT_MV ? target_vm : MIN_VOLT_MV;
+	dvfs->avs[2] = dvfs->avs[3] = armada_37xx_avs_val_match(target_vm);
+}
+
+static void __init armada37xx_cpufreq_avs_setup(struct regmap *base,
+						struct armada_37xx_dvfs *dvfs)
+{
+	unsigned int avs_val = 0, freq;
+	int load_level = 0;
+
+	if (base == NULL)
+		return;
+
+	/* Disable AVS before the configuration */
+	regmap_update_bits(base, ARMADA_37XX_AVS_CTL0,
+			   ARMADA_37XX_AVS_ENABLE, 0);
+
+
+	/* Enable low voltage mode */
+	regmap_update_bits(base, ARMADA_37XX_AVS_CTL2,
+			   ARMADA_37XX_AVS_LOW_VDD_EN,
+			   ARMADA_37XX_AVS_LOW_VDD_EN);
+
+
+	for (load_level = 1; load_level < LOAD_LEVEL_NR; load_level++) {
+		freq = dvfs->cpu_freq_max / dvfs->divider[load_level];
+
+		avs_val = dvfs->avs[load_level];
+		regmap_update_bits(base, ARMADA_37XX_AVS_VSET(load_level-1),
+		    ARMADA_37XX_AVS_VDD_MASK << ARMADA_37XX_AVS_HIGH_VDD_LIMIT |
+		    ARMADA_37XX_AVS_VDD_MASK << ARMADA_37XX_AVS_LOW_VDD_LIMIT,
+		    avs_val << ARMADA_37XX_AVS_HIGH_VDD_LIMIT |
+		    avs_val << ARMADA_37XX_AVS_LOW_VDD_LIMIT);
+	}
+
+	/* Enable AVS after the configuration */
+	regmap_update_bits(base, ARMADA_37XX_AVS_CTL0,
+			   ARMADA_37XX_AVS_ENABLE,
+			   ARMADA_37XX_AVS_ENABLE);
+
+}
+
 static void armada37xx_cpufreq_disable_dvfs(struct regmap *base)
 {
 	unsigned int reg = ARMADA_37XX_NB_DYN_MOD,
@@ -216,7 +360,7 @@ static int __init armada37xx_cpufreq_driver_init(void)
 	struct platform_device *pdev;
 	unsigned long freq;
 	unsigned int cur_frequency;
-	struct regmap *nb_pm_base;
+	struct regmap *nb_pm_base, *avs_base;
 	struct device *cpu_dev;
 	int load_lvl, ret;
 	struct clk *clk;
@@ -227,6 +371,14 @@ static int __init armada37xx_cpufreq_driver_init(void)
 	if (IS_ERR(nb_pm_base))
 		return -ENODEV;
 
+	avs_base =
+		syscon_regmap_lookup_by_compatible("marvell,armada-3700-avs");
+
+	/* if AVS is not present don't use it but still try to setup dvfs */
+	if (IS_ERR(avs_base)) {
+		pr_info("Syscon failed for Adapting Voltage Scaling: skip it\n");
+		avs_base = NULL;
+	}
 	/* Before doing any configuration on the DVFS first, disable it */
 	armada37xx_cpufreq_disable_dvfs(nb_pm_base);
 
@@ -270,16 +422,21 @@ static int __init armada37xx_cpufreq_driver_init(void)
 
 	armada37xx_cpufreq_state->regmap = nb_pm_base;
 
+	armada37xx_cpufreq_avs_configure(avs_base, dvfs);
+	armada37xx_cpufreq_avs_setup(avs_base, dvfs);
+
 	armada37xx_cpufreq_dvfs_setup(nb_pm_base, clk, dvfs->divider);
 	clk_put(clk);
 
 	for (load_lvl = ARMADA_37XX_DVFS_LOAD_0; load_lvl < LOAD_LEVEL_NR;
 	     load_lvl++) {
+		unsigned long u_volt = avs_map[dvfs->avs[load_lvl]] * 1000;
 		freq = cur_frequency / dvfs->divider[load_lvl];
-
-		ret = dev_pm_opp_add(cpu_dev, freq, 0);
+		ret = dev_pm_opp_add(cpu_dev, freq, u_volt);
 		if (ret)
 			goto remove_opp;
+
+
 	}
 
 	/* Now that everything is setup, enable the DVFS at hardware level */

From 33477d84c26bbfa626da2c032e567a90dd70a528 Mon Sep 17 00:00:00 2001
From: George Cherian <george.cherian@cavium.com>
Date: Wed, 11 Jul 2018 23:07:55 -0700
Subject: [PATCH 25/35] cpufreq / CPPC: Add cpuinfo_cur_freq support for CPPC

Per Section 8.4.7.1.3 of ACPI 6.2, the platform provides performance
feedback via set of performance counters. To determine the actual
performance level delivered over time, OSPM may read a set of
performance counters from the Reference Performance Counter Register
and the Delivered Performance Counter Register.

OSPM calculates the delivered performance over a given time period by
taking a beginning and ending snapshot of both the reference and
delivered performance counters, and calculating:

delivered_perf = reference_perf X (delta of delivered_perf counter / delta of reference_perf counter).

Implement the above and hook this up to the cpufreq->get method.

Signed-off-by: George Cherian <george.cherian@cavium.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Prashanth Prakash <pprakash@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cppc_cpufreq.c | 52 ++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index a9d3eec32795..30f302149730 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -296,10 +296,62 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	return ret;
 }
 
+static inline u64 get_delta(u64 t1, u64 t0)
+{
+	if (t1 > t0 || t0 > ~(u32)0)
+		return t1 - t0;
+
+	return (u32)t1 - (u32)t0;
+}
+
+static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu,
+				     struct cppc_perf_fb_ctrs fb_ctrs_t0,
+				     struct cppc_perf_fb_ctrs fb_ctrs_t1)
+{
+	u64 delta_reference, delta_delivered;
+	u64 reference_perf, delivered_perf;
+
+	reference_perf = fb_ctrs_t0.reference_perf;
+
+	delta_reference = get_delta(fb_ctrs_t1.reference,
+				    fb_ctrs_t0.reference);
+	delta_delivered = get_delta(fb_ctrs_t1.delivered,
+				    fb_ctrs_t0.delivered);
+
+	/* Check to avoid divide-by zero */
+	if (delta_reference || delta_delivered)
+		delivered_perf = (reference_perf * delta_delivered) /
+					delta_reference;
+	else
+		delivered_perf = cpu->perf_ctrls.desired_perf;
+
+	return cppc_cpufreq_perf_to_khz(cpu, delivered_perf);
+}
+
+static unsigned int cppc_cpufreq_get_rate(unsigned int cpunum)
+{
+	struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0};
+	struct cppc_cpudata *cpu = all_cpu_data[cpunum];
+	int ret;
+
+	ret = cppc_get_perf_ctrs(cpunum, &fb_ctrs_t0);
+	if (ret)
+		return ret;
+
+	udelay(2); /* 2usec delay between sampling */
+
+	ret = cppc_get_perf_ctrs(cpunum, &fb_ctrs_t1);
+	if (ret)
+		return ret;
+
+	return cppc_get_rate_from_fbctrs(cpu, fb_ctrs_t0, fb_ctrs_t1);
+}
+
 static struct cpufreq_driver cppc_cpufreq_driver = {
 	.flags = CPUFREQ_CONST_LOOPS,
 	.verify = cppc_verify_policy,
 	.target = cppc_cpufreq_set_target,
+	.get = cppc_cpufreq_get_rate,
 	.init = cppc_cpufreq_cpu_init,
 	.stop_cpu = cppc_cpufreq_stop_cpu,
 	.name = "cppc_cpufreq",

From f54ab690ad68e4c5a5b4d4b5dbb28a35018546c5 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@linaro.org>
Date: Tue, 17 Jul 2018 22:48:21 +0200
Subject: [PATCH 26/35] cpufreq: qcom-kryo: Silently error out on EPROBE_DEFER

If of_nvmem_cell_get() fails due to probe deferal, we shouldn't print an
error message. Just be silent in this case.

Signed-off-by: Niklas Cassel <niklas.cassel@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/qcom-cpufreq-kryo.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-kryo.c b/drivers/cpufreq/qcom-cpufreq-kryo.c
index 29389accf3e9..b8d1e6875f16 100644
--- a/drivers/cpufreq/qcom-cpufreq-kryo.c
+++ b/drivers/cpufreq/qcom-cpufreq-kryo.c
@@ -109,8 +109,9 @@ static int qcom_cpufreq_kryo_probe(struct platform_device *pdev)
 	speedbin_nvmem = of_nvmem_cell_get(np, NULL);
 	of_node_put(np);
 	if (IS_ERR(speedbin_nvmem)) {
-		dev_err(cpu_dev, "Could not get nvmem cell: %ld\n",
-			PTR_ERR(speedbin_nvmem));
+		if (PTR_ERR(speedbin_nvmem) != -EPROBE_DEFER)
+			dev_err(cpu_dev, "Could not get nvmem cell: %ld\n",
+				PTR_ERR(speedbin_nvmem));
 		return PTR_ERR(speedbin_nvmem);
 	}
 

From 4d81b0f9e631f751bf231213893e202a51f76687 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 18 Jul 2018 10:15:07 +0200
Subject: [PATCH 27/35] cpufreq: pcc-cpufreq: Disable dynamic scaling on
 many-CPU systems

The firmware interface used by the pcc-cpufreq driver is
fundamentally not scalable and using it for dynamic CPU performance
scaling on systems with many CPUs leads to degraded performance.

For this reason, disable dynamic CPU performance scaling on systems
with pcc-cpufreq where the number of CPUs present at the driver init
time is greater than 4.  Also make the driver print corresponding
complaints to the kernel log.

Reported-by: Andreas Herrmann <aherrmann@suse.com>
Tested-by: Andreas Herrmann <aherrmann@suse.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/pcc-cpufreq.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c
index 3f0ce2ae35ee..ef8d984178a5 100644
--- a/drivers/cpufreq/pcc-cpufreq.c
+++ b/drivers/cpufreq/pcc-cpufreq.c
@@ -589,6 +589,15 @@ static int __init pcc_cpufreq_init(void)
 		return ret;
 	}
 
+	if (num_present_cpus() > 4) {
+		pcc_cpufreq_driver.flags |= CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING;
+		pr_err("%s: Too many CPUs, dynamic performance scaling disabled\n",
+		       __func__);
+		pr_err("%s: Try to enable another scaling driver through BIOS settings\n",
+		       __func__);
+		pr_err("%s: and complain to the system vendor\n", __func__);
+	}
+
 	ret = cpufreq_register_driver(&pcc_cpufreq_driver);
 
 	return ret;

From eea033d07543a177fc2ab35a6d633b2aa9684b0f Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 18 Jul 2018 14:51:59 -0700
Subject: [PATCH 28/35] cpufreq: intel_pstate: Show different max frequency
 with turbo 3 and HWP

On HWP platforms with Turbo 3.0, the HWP capability max ratio shows the
maximum ratio of that core, which can be different than other cores. If
we show the correct maximum frequency in cpufreq sysfs via
cpuinfo_max_freq and scaling_max_freq then, user can know which cores
can run faster for pinning some high priority tasks.

Currently the max turbo frequency is shown as max frequency, which is
the max of all cores, even if some cores can't reach that frequency
even for single threaded workload.

But it is possible that max ratio in HWP capabilities is set as 0xFF or
some high invalid value (E.g. One KBL NUC). Since the actual performance
can never exceed 1 core turbo frequency from MSR TURBO_RATIO_LIMIT, we
use this as a bound check.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index a5c368425e36..2584dd00e3c9 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2069,6 +2069,15 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
 			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
 	policy->cpuinfo.max_freq *= cpu->pstate.scaling;
 
+	if (hwp_active) {
+		unsigned int max_freq;
+
+		max_freq = global.turbo_disabled ?
+			cpu->pstate.max_freq : cpu->pstate.turbo_freq;
+		if (max_freq < policy->cpuinfo.max_freq)
+			policy->cpuinfo.max_freq = max_freq;
+	}
+
 	intel_pstate_init_acpi_perf_limits(policy);
 
 	policy->fast_switch_possible = true;

From 601b218568a107370086dc5c7a1b283f8d463268 Mon Sep 17 00:00:00 2001
From: Ruchi Kandoi <kandoiruchi@google.com>
Date: Tue, 24 Jul 2018 10:35:44 -0700
Subject: [PATCH 29/35] cpufreq: trace frequency limits change

systrace used for tracing for Android systems has carried a patch for
many years in the Android tree that traces when the cpufreq limits
change.  With the help of this information, systrace can know when the
policy limits change and can visually display the data. Lets add
upstream support for the same.

Signed-off-by: Ruchi Kandoi <kandoiruchi@google.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/trace/events-power.rst |  1 +
 drivers/cpufreq/cpufreq.c            |  1 +
 include/trace/events/power.h         | 25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/Documentation/trace/events-power.rst b/Documentation/trace/events-power.rst
index a77daca75e30..2ef318962e29 100644
--- a/Documentation/trace/events-power.rst
+++ b/Documentation/trace/events-power.rst
@@ -27,6 +27,7 @@ cpufreq.
 
   cpu_idle		"state=%lu cpu_id=%lu"
   cpu_frequency		"state=%lu cpu_id=%lu"
+  cpu_frequency_limits	"min=%lu max=%lu cpu_id=%lu"
 
 A suspend event is used to indicate the system going in and out of the
 suspend mode:
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b0dfd3222013..52566f1f1050 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2236,6 +2236,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 
 	policy->min = new_policy->min;
 	policy->max = new_policy->max;
+	trace_cpu_frequency_limits(policy);
 
 	policy->cached_target_freq = UINT_MAX;
 
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 908977d69783..f7aece721aed 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -5,6 +5,7 @@
 #if !defined(_TRACE_POWER_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_POWER_H
 
+#include <linux/cpufreq.h>
 #include <linux/ktime.h>
 #include <linux/pm_qos.h>
 #include <linux/tracepoint.h>
@@ -148,6 +149,30 @@ DEFINE_EVENT(cpu, cpu_frequency,
 	TP_ARGS(frequency, cpu_id)
 );
 
+TRACE_EVENT(cpu_frequency_limits,
+
+	TP_PROTO(struct cpufreq_policy *policy),
+
+	TP_ARGS(policy),
+
+	TP_STRUCT__entry(
+		__field(u32, min_freq)
+		__field(u32, max_freq)
+		__field(u32, cpu_id)
+	),
+
+	TP_fast_assign(
+		__entry->min_freq = policy->min;
+		__entry->max_freq = policy->max;
+		__entry->cpu_id = policy->cpu;
+	),
+
+	TP_printk("min=%lu max=%lu cpu_id=%lu",
+		  (unsigned long)__entry->min_freq,
+		  (unsigned long)__entry->max_freq,
+		  (unsigned long)__entry->cpu_id)
+);
+
 TRACE_EVENT(device_pm_callback_start,
 
 	TP_PROTO(struct device *dev, const char *pm_ops, int event),

From c2cd0b08e1efd9ee58d09049a6c77e5efa0ef627 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 24 Jul 2018 09:49:28 -0700
Subject: [PATCH 30/35] x86/power/hibernate_64: Remove VLA usage

In the quest to remove all stack VLA usage from the kernel [1], this
removes the discouraged use of AHASH_REQUEST_ON_STACK by switching to
shash directly and allocating the descriptor in heap memory (which should
be fine: the tfm has already been allocated there too).

Link: https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com # [1]
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/power/hibernate_64.c | 36 ++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 67ccf64c8bd8..f8e3b668d20b 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -233,29 +233,35 @@ struct restore_data_record {
  */
 static int get_e820_md5(struct e820_table *table, void *buf)
 {
-	struct scatterlist sg;
-	struct crypto_ahash *tfm;
+	struct crypto_shash *tfm;
+	struct shash_desc *desc;
 	int size;
 	int ret = 0;
 
-	tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
+	tfm = crypto_alloc_shash("md5", 0, 0);
 	if (IS_ERR(tfm))
 		return -ENOMEM;
 
-	{
-		AHASH_REQUEST_ON_STACK(req, tfm);
-		size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries;
-		ahash_request_set_tfm(req, tfm);
-		sg_init_one(&sg, (u8 *)table, size);
-		ahash_request_set_callback(req, 0, NULL, NULL);
-		ahash_request_set_crypt(req, &sg, buf, size);
-
-		if (crypto_ahash_digest(req))
-			ret = -EINVAL;
-		ahash_request_zero(req);
+	desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
+		       GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto free_tfm;
 	}
-	crypto_free_ahash(tfm);
 
+	desc->tfm = tfm;
+	desc->flags = 0;
+
+	size = offsetof(struct e820_table, entries) +
+		sizeof(struct e820_entry) * table->nr_entries;
+
+	if (crypto_shash_digest(desc, (u8 *)table, size, buf))
+		ret = -EINVAL;
+
+	kzfree(desc);
+
+free_tfm:
+	crypto_free_shash(tfm);
 	return ret;
 }
 

From 6f4ceee9305dc3fe74099159b460f4b56b506f1d Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 24 Jul 2018 14:26:04 -0400
Subject: [PATCH 31/35] cpu/hotplug: Add a cpus_read_trylock() function

There are use cases where it can be useful to have a cpus_read_trylock()
function to work around circular lock dependency problem involving
the cpu_hotplug_lock.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpu.h | 2 ++
 kernel/cpu.c        | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index a97a63eef59f..e850bfea3e84 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -103,6 +103,7 @@ extern void cpus_write_lock(void);
 extern void cpus_write_unlock(void);
 extern void cpus_read_lock(void);
 extern void cpus_read_unlock(void);
+extern int  cpus_read_trylock(void);
 extern void lockdep_assert_cpus_held(void);
 extern void cpu_hotplug_disable(void);
 extern void cpu_hotplug_enable(void);
@@ -115,6 +116,7 @@ static inline void cpus_write_lock(void) { }
 static inline void cpus_write_unlock(void) { }
 static inline void cpus_read_lock(void) { }
 static inline void cpus_read_unlock(void) { }
+static inline int  cpus_read_trylock(void) { return true; }
 static inline void lockdep_assert_cpus_held(void) { }
 static inline void cpu_hotplug_disable(void) { }
 static inline void cpu_hotplug_enable(void) { }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0db8938fbb23..307486baa477 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -290,6 +290,12 @@ void cpus_read_lock(void)
 }
 EXPORT_SYMBOL_GPL(cpus_read_lock);
 
+int cpus_read_trylock(void)
+{
+	return percpu_down_read_trylock(&cpu_hotplug_lock);
+}
+EXPORT_SYMBOL_GPL(cpus_read_trylock);
+
 void cpus_read_unlock(void)
 {
 	percpu_up_read(&cpu_hotplug_lock);

From 9b3d9bb3e4deef41095e513c2ffbebab20f9a982 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 24 Jul 2018 14:26:05 -0400
Subject: [PATCH 32/35] cpufreq: Fix a circular lock dependency problem

With lockdep turned on, the following circular lock dependency problem
was reported:

[   57.470040] ======================================================
[   57.502900] WARNING: possible circular locking dependency detected
[   57.535208] 4.18.0-0.rc3.1.el8+7.x86_64+debug #1 Tainted: G
[   57.577761] ------------------------------------------------------
[   57.609714] tuned/1505 is trying to acquire lock:
[   57.633808] 00000000559deec5 (cpu_hotplug_lock.rw_sem){++++}, at: store+0x27/0x120
[   57.672880]
[   57.672880] but task is already holding lock:
[   57.702184] 000000002136ca64 (kn->count#118){++++}, at: kernfs_fop_write+0x1d0/0x410
[   57.742176]
[   57.742176] which lock already depends on the new lock.
[   57.742176]
[   57.785220]
[   57.785220] the existing dependency chain (in reverse order) is:
    :
[   58.932512] other info that might help us debug this:
[   58.932512]
[   58.973344] Chain exists of:
[   58.973344]   cpu_hotplug_lock.rw_sem --> subsys mutex#5 --> kn->count#118
[   58.973344]
[   59.030795]  Possible unsafe locking scenario:
[   59.030795]
[   59.061248]        CPU0                    CPU1
[   59.085377]        ----                    ----
[   59.108160]   lock(kn->count#118);
[   59.124935]                                lock(subsys mutex#5);
[   59.156330]                                lock(kn->count#118);
[   59.186088]   lock(cpu_hotplug_lock.rw_sem);
[   59.208541]
[   59.208541]  *** DEADLOCK ***

In the cpufreq_register_driver() function, the lock sequence is:

  cpus_read_lock --> kn->count

For the cpufreq sysfs store method, the lock sequence is:

  kn->count --> cpus_read_lock

These sequences are actually safe as they are taking a share lock on
cpu_hotplug_lock. However, the current lockdep code doesn't check for
share locking when detecting circular lock dependency.  Fixing that
could be a substantial effort.

Instead, we can work around this problem by using cpus_read_trylock()
in the store method which is much simpler. The chance of not getting
the read lock is very small. If that happens, the userspace application
that writes the sysfs file will get an error.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 52566f1f1050..f53fb41efb7b 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -923,7 +923,12 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
 	struct freq_attr *fattr = to_attr(attr);
 	ssize_t ret = -EINVAL;
 
-	cpus_read_lock();
+	/*
+	 * cpus_read_trylock() is used here to work around a circular lock
+	 * dependency problem with respect to the cpufreq_register_driver().
+	 */
+	if (!cpus_read_trylock())
+		return -EBUSY;
 
 	if (cpu_online(policy->cpu)) {
 		down_write(&policy->rwsem);

From d3264f752a1aedac98aa90e50853df149d1346f2 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 1 Aug 2018 17:26:06 -0700
Subject: [PATCH 33/35] cpufreq: intel_pstate: Ignore turbo active ratio in HWP

When HWP is active turbo active ratio is not used, so we should allow
policy max frequency above turbo activation ratio to be set. When HWP is
not active, then any policy max frequency above turbo activation ratio
can result upto max one-core turbo frequency.

This fix helps better thermal control in turbo region when other methods
like "Running Average Power Limit" is not available to use.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 7bfb98380db9..b6a1aadaff9f 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2008,7 +2008,8 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 static void intel_pstate_adjust_policy_max(struct cpufreq_policy *policy,
 					 struct cpudata *cpu)
 {
-	if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
+	if (!hwp_active &&
+	    cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
 	    policy->max < policy->cpuinfo.max_freq &&
 	    policy->max > cpu->pstate.max_freq) {
 		pr_debug("policy->max > max non turbo frequency\n");

From 82837ad5bda79cd13d64b9abad92e7872a06bc65 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Sat, 4 Aug 2018 21:12:00 -0500
Subject: [PATCH 34/35] PM / hibernate: Mark expected switch fall-through

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

This addresses Coverity-ID: 114713 ("Missing break in switch").

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 9c85c7822383..38f3f96b4b12 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -638,6 +638,7 @@ static void power_down(void)
 		break;
 	case HIBERNATION_PLATFORM:
 		hibernation_platform_enter();
+		/* Fall through */
 	case HIBERNATION_SHUTDOWN:
 		if (pm_power_off)
 			kernel_power_off();

From 55f2503c3b69328735e88031ff8d6ba291bd952b Mon Sep 17 00:00:00 2001
From: Pingfan Liu <kernelfans@gmail.com>
Date: Tue, 31 Jul 2018 16:51:32 +0800
Subject: [PATCH 35/35] PM / reboot: Eliminate race between reboot and suspend

At present, "systemctl suspend" and "shutdown" can run in parrallel. A
system can suspend after devices_shutdown(), and resume. Then the shutdown
task goes on to power off. This causes many devices are not really shut
off. Hence replacing reboot_mutex with system_transition_mutex (renamed
from pm_mutex) to achieve the exclusion. The renaming of pm_mutex as
system_transition_mutex can be better to reflect the purpose of the mutex.

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/freezing-of-tasks.txt      | 12 ++++++------
 Documentation/power/suspend-and-cpuhotplug.txt |  6 +++---
 include/linux/suspend.h                        |  2 +-
 kernel/freezer.c                               |  4 +++-
 kernel/power/hibernate.c                       | 15 ++++++++-------
 kernel/power/main.c                            | 12 ++++++------
 kernel/power/suspend.c                         |  4 ++--
 kernel/power/user.c                            |  4 ++--
 kernel/reboot.c                                |  6 +++---
 mm/page_alloc.c                                | 11 ++++++-----
 10 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
index af005770e767..cd283190855a 100644
--- a/Documentation/power/freezing-of-tasks.txt
+++ b/Documentation/power/freezing-of-tasks.txt
@@ -204,26 +204,26 @@ VI. Are there any precautions to be taken to prevent freezing failures?
 
 Yes, there are.
 
-First of all, grabbing the 'pm_mutex' lock to mutually exclude a piece of code
+First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
 from system-wide sleep such as suspend/hibernation is not encouraged.
 If possible, that piece of code must instead hook onto the suspend/hibernation
 notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
 (kernel/cpu.c) for an example.
 
-However, if that is not feasible, and grabbing 'pm_mutex' is deemed necessary,
-it is strongly discouraged to directly call mutex_[un]lock(&pm_mutex) since
+However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
+it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
 that could lead to freezing failures, because if the suspend/hibernate code
-successfully acquired the 'pm_mutex' lock, and hence that other entity failed
+successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
 to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
 state. As a consequence, the freezer would not be able to freeze that task,
 leading to freezing failure.
 
 However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
 since they ask the freezer to skip freezing this task, since it is anyway
-"frozen enough" as it is blocked on 'pm_mutex', which will be released
+"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
 only after the entire suspend/hibernation sequence is complete.
 So, to summarize, use [un]lock_system_sleep() instead of directly using
-mutex_[un]lock(&pm_mutex). That would prevent freezing failures.
+mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
 
 V. Miscellaneous
 /sys/power/pm_freeze_timeout controls how long it will cost at most to freeze
diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt
index 6f55eb960a6d..a8751b8df10e 100644
--- a/Documentation/power/suspend-and-cpuhotplug.txt
+++ b/Documentation/power/suspend-and-cpuhotplug.txt
@@ -32,7 +32,7 @@ More details follow:
                                     sysfs file
                                         |
                                         v
-                               Acquire pm_mutex lock
+                               Acquire system_transition_mutex lock
                                         |
                                         v
                              Send PM_SUSPEND_PREPARE
@@ -96,10 +96,10 @@ execution during resume):
 
 * thaw tasks
 * send PM_POST_SUSPEND notifications
-* Release pm_mutex lock.
+* Release system_transition_mutex lock.
 
 
-It is to be noted here that the pm_mutex lock is acquired at the very
+It is to be noted here that the system_transition_mutex lock is acquired at the very
 beginning, when we are just starting out to suspend, and then released only
 after the entire cycle is complete (i.e., suspend + resume).
 
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 440b62f7502e..5a28ac9284f0 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -414,7 +414,7 @@ static inline bool hibernation_available(void) { return false; }
 #define PM_RESTORE_PREPARE	0x0005 /* Going to restore a saved image */
 #define PM_POST_RESTORE		0x0006 /* Restore failed */
 
-extern struct mutex pm_mutex;
+extern struct mutex system_transition_mutex;
 
 #ifdef CONFIG_PM_SLEEP
 void save_processor_state(void);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 6f56a9e219fa..b162b74611e4 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -15,7 +15,9 @@
 atomic_t system_freezing_cnt = ATOMIC_INIT(0);
 EXPORT_SYMBOL(system_freezing_cnt);
 
-/* indicate whether PM freezing is in effect, protected by pm_mutex */
+/* indicate whether PM freezing is in effect, protected by
+ * system_transition_mutex
+ */
 bool pm_freezing;
 bool pm_nosig_freezing;
 
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 38f3f96b4b12..abef759de7c8 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -338,7 +338,7 @@ static int create_image(int platform_mode)
  * hibernation_snapshot - Quiesce devices and create a hibernation image.
  * @platform_mode: If set, use platform driver to prepare for the transition.
  *
- * This routine must be called with pm_mutex held.
+ * This routine must be called with system_transition_mutex held.
  */
 int hibernation_snapshot(int platform_mode)
 {
@@ -500,8 +500,9 @@ static int resume_target_kernel(bool platform_mode)
  * hibernation_restore - Quiesce devices and restore from a hibernation image.
  * @platform_mode: If set, use platform driver to prepare for the transition.
  *
- * This routine must be called with pm_mutex held.  If it is successful, control
- * reappears in the restored target kernel in hibernation_snapshot().
+ * This routine must be called with system_transition_mutex held.  If it is
+ * successful, control reappears in the restored target kernel in
+ * hibernation_snapshot().
  */
 int hibernation_restore(int platform_mode)
 {
@@ -806,13 +807,13 @@ static int software_resume(void)
 	 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
 	 * is configured into the kernel. Since the regular hibernate
 	 * trigger path is via sysfs which takes a buffer mutex before
-	 * calling hibernate functions (which take pm_mutex) this can
-	 * cause lockdep to complain about a possible ABBA deadlock
+	 * calling hibernate functions (which take system_transition_mutex)
+	 * this can cause lockdep to complain about a possible ABBA deadlock
 	 * which cannot happen since we're in the boot code here and
 	 * sysfs can't be invoked yet. Therefore, we use a subclass
 	 * here to avoid lockdep complaining.
 	 */
-	mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING);
 
 	if (swsusp_resume_device)
 		goto Check_image;
@@ -900,7 +901,7 @@ static int software_resume(void)
 	atomic_inc(&snapshot_device_available);
 	/* For success case, the suspend path will release the lock */
  Unlock:
-	mutex_unlock(&pm_mutex);
+	mutex_unlock(&system_transition_mutex);
 	pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
 	return error;
  Close_Finish:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d9706da10930..35b50823d83b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,17 +15,16 @@
 #include <linux/workqueue.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/suspend.h>
 
 #include "power.h"
 
-DEFINE_MUTEX(pm_mutex);
-
 #ifdef CONFIG_PM_SLEEP
 
 void lock_system_sleep(void)
 {
 	current->flags |= PF_FREEZER_SKIP;
-	mutex_lock(&pm_mutex);
+	mutex_lock(&system_transition_mutex);
 }
 EXPORT_SYMBOL_GPL(lock_system_sleep);
 
@@ -37,8 +36,9 @@ void unlock_system_sleep(void)
 	 *
 	 * Reason:
 	 * Fundamentally, we just don't need it, because freezing condition
-	 * doesn't come into effect until we release the pm_mutex lock,
-	 * since the freezer always works with pm_mutex held.
+	 * doesn't come into effect until we release the
+	 * system_transition_mutex lock, since the freezer always works with
+	 * system_transition_mutex held.
 	 *
 	 * More importantly, in the case of hibernation,
 	 * unlock_system_sleep() gets called in snapshot_read() and
@@ -47,7 +47,7 @@ void unlock_system_sleep(void)
 	 * enter the refrigerator, thus causing hibernation to lockup.
 	 */
 	current->flags &= ~PF_FREEZER_SKIP;
-	mutex_unlock(&pm_mutex);
+	mutex_unlock(&system_transition_mutex);
 }
 EXPORT_SYMBOL_GPL(unlock_system_sleep);
 
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 87331565e505..9e13afe65a14 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -556,7 +556,7 @@ static int enter_state(suspend_state_t state)
 	} else if (!valid_state(state)) {
 		return -EINVAL;
 	}
-	if (!mutex_trylock(&pm_mutex))
+	if (!mutex_trylock(&system_transition_mutex))
 		return -EBUSY;
 
 	if (state == PM_SUSPEND_TO_IDLE)
@@ -590,7 +590,7 @@ static int enter_state(suspend_state_t state)
 	pm_pr_dbg("Finishing wakeup.\n");
 	suspend_finish();
  Unlock:
-	mutex_unlock(&pm_mutex);
+	mutex_unlock(&system_transition_mutex);
 	return error;
 }
 
diff --git a/kernel/power/user.c b/kernel/power/user.c
index abd225550271..2d8b60a3c86b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -216,7 +216,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!mutex_trylock(&pm_mutex))
+	if (!mutex_trylock(&system_transition_mutex))
 		return -EBUSY;
 
 	lock_device_hotplug();
@@ -394,7 +394,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	}
 
 	unlock_device_hotplug();
-	mutex_unlock(&pm_mutex);
+	mutex_unlock(&system_transition_mutex);
 
 	return error;
 }
diff --git a/kernel/reboot.c b/kernel/reboot.c
index e4ced883d8de..8fb44dec9ad7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -294,7 +294,7 @@ void kernel_power_off(void)
 }
 EXPORT_SYMBOL_GPL(kernel_power_off);
 
-static DEFINE_MUTEX(reboot_mutex);
+DEFINE_MUTEX(system_transition_mutex);
 
 /*
  * Reboot system call: for obvious reasons only root may call it,
@@ -338,7 +338,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
 		cmd = LINUX_REBOOT_CMD_HALT;
 
-	mutex_lock(&reboot_mutex);
+	mutex_lock(&system_transition_mutex);
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
 		kernel_restart(NULL);
@@ -389,7 +389,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		ret = -EINVAL;
 		break;
 	}
-	mutex_unlock(&reboot_mutex);
+	mutex_unlock(&system_transition_mutex);
 	return ret;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1521100f1e63..e47f9e467408 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -155,16 +155,17 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
  * The following functions are used by the suspend/hibernate code to temporarily
  * change gfp_allowed_mask in order to avoid using I/O during memory allocations
  * while devices are suspended.  To avoid races with the suspend/hibernate code,
- * they should always be called with pm_mutex held (gfp_allowed_mask also should
- * only be modified with pm_mutex held, unless the suspend/hibernate code is
- * guaranteed not to run in parallel with that modification).
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
  */
 
 static gfp_t saved_gfp_mask;
 
 void pm_restore_gfp_mask(void)
 {
-	WARN_ON(!mutex_is_locked(&pm_mutex));
+	WARN_ON(!mutex_is_locked(&system_transition_mutex));
 	if (saved_gfp_mask) {
 		gfp_allowed_mask = saved_gfp_mask;
 		saved_gfp_mask = 0;
@@ -173,7 +174,7 @@ void pm_restore_gfp_mask(void)
 
 void pm_restrict_gfp_mask(void)
 {
-	WARN_ON(!mutex_is_locked(&pm_mutex));
+	WARN_ON(!mutex_is_locked(&system_transition_mutex));
 	WARN_ON(saved_gfp_mask);
 	saved_gfp_mask = gfp_allowed_mask;
 	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);