From 0d4b54c6fee87ff60b0bc1007ca487449698468d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 18 Nov 2017 15:31:49 +0100
Subject: [PATCH 01/89] PM / core: Add LEAVE_SUSPENDED driver flag

Define and document a new driver flag, DPM_FLAG_LEAVE_SUSPENDED, to
instruct the PM core and middle-layer (bus type, PM domain, etc.)
code that it is desirable to leave the device in runtime suspend
after system-wide transitions to the working state (for example,
the device may be slow to resume and it may be better to avoid
resuming it right away).

Generally, the middle-layer code involved in the handling of the
device is expected to indicate to the PM core whether or not the
device may be left in suspend with the help of the device's
power.may_skip_resume status bit.  That has to happen in the "noirq"
phase of the preceding system suspend (or analogous) transition.
The middle layer is then responsible for handling the device as
appropriate in its "noirq" resume callback which is executed
regardless of whether or not the device may be left suspended, but
the other resume callbacks (except for ->complete) will be skipped
automatically by the core if the device really can be left in
suspend.

The additional power.must_resume status bit introduced for the
implementation of this mechanisn is used internally by the PM core
to track the requirement to resume the device (which may depend on
its children etc).

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 Documentation/driver-api/pm/devices.rst | 27 ++++++++-
 drivers/base/power/main.c               | 73 +++++++++++++++++++++++--
 include/linux/pm.h                      | 16 ++++--
 3 files changed, 105 insertions(+), 11 deletions(-)

diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst
index 53c1b0b06da5..b0fe63c91f8d 100644
--- a/Documentation/driver-api/pm/devices.rst
+++ b/Documentation/driver-api/pm/devices.rst
@@ -788,6 +788,29 @@ must reflect the "active" status for runtime PM in that case.
 
 During system-wide resume from a sleep state it's easiest to put devices into
 the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`.
-Refer to that document for more information regarding this particular issue as
+[Refer to that document for more information regarding this particular issue as
 well as for information on the device runtime power management framework in
-general.
+general.]
+
+However, it often is desirable to leave devices in suspend after system
+transitions to the working state, especially if those devices had been in
+runtime suspend before the preceding system-wide suspend (or analogous)
+transition.  Device drivers can use the ``DPM_FLAG_LEAVE_SUSPENDED`` flag to
+indicate to the PM core (and middle-layer code) that they prefer the specific
+devices handled by them to be left suspended and they have no problems with
+skipping their system-wide resume callbacks for this reason.  Whether or not the
+devices will actually be left in suspend may depend on their state before the
+given system suspend-resume cycle and on the type of the system transition under
+way.  In particular, devices are not left suspended if that transition is a
+restore from hibernation, as device states are not guaranteed to be reflected
+by the information stored in the hibernation image in that case.
+
+The middle-layer code involved in the handling of the device is expected to
+indicate to the PM core if the device may be left in suspend by setting its
+:c:member:`power.may_skip_resume` status bit which is checked by the PM core
+during the "noirq" phase of the preceding system-wide suspend (or analogous)
+transition.  The middle layer is then responsible for handling the device as
+appropriate in its "noirq" resume callback, which is executed regardless of
+whether or not the device is left suspended, but the other resume callbacks
+(except for ``->complete``) will be skipped automatically by the PM core if the
+device really can be left in suspend.
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index db2f04415927..73ec6796d9e1 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -525,6 +525,18 @@ static void dpm_watchdog_clear(struct dpm_watchdog *wd)
 
 /*------------------------- Resume routines -------------------------*/
 
+/**
+ * dev_pm_may_skip_resume - System-wide device resume optimization check.
+ * @dev: Target device.
+ *
+ * Checks whether or not the device may be left in suspend after a system-wide
+ * transition to the working state.
+ */
+bool dev_pm_may_skip_resume(struct device *dev)
+{
+	return !dev->power.must_resume && pm_transition.event != PM_EVENT_RESTORE;
+}
+
 /**
  * device_resume_noirq - Execute a "noirq resume" callback for given device.
  * @dev: Device to handle.
@@ -573,6 +585,19 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
 	error = dpm_run_callback(callback, dev, state, info);
 	dev->power.is_noirq_suspended = false;
 
+	if (dev_pm_may_skip_resume(dev)) {
+		/*
+		 * The device is going to be left in suspend, but it might not
+		 * have been in runtime suspend before the system suspended, so
+		 * its runtime PM status needs to be updated to avoid confusing
+		 * the runtime PM framework when runtime PM is enabled for the
+		 * device again.
+		 */
+		pm_runtime_set_suspended(dev);
+		dev->power.is_late_suspended = false;
+		dev->power.is_suspended = false;
+	}
+
  Out:
 	complete_all(&dev->power.completion);
 	TRACE_RESUME(error);
@@ -1074,6 +1099,22 @@ static pm_message_t resume_event(pm_message_t sleep_state)
 	return PMSG_ON;
 }
 
+static void dpm_superior_set_must_resume(struct device *dev)
+{
+	struct device_link *link;
+	int idx;
+
+	if (dev->parent)
+		dev->parent->power.must_resume = true;
+
+	idx = device_links_read_lock();
+
+	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node)
+		link->supplier->power.must_resume = true;
+
+	device_links_read_unlock(idx);
+}
+
 /**
  * __device_suspend_noirq - Execute a "noirq suspend" callback for given device.
  * @dev: Device to handle.
@@ -1125,10 +1166,28 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a
 	}
 
 	error = dpm_run_callback(callback, dev, state, info);
-	if (!error)
-		dev->power.is_noirq_suspended = true;
-	else
+	if (error) {
 		async_error = error;
+		goto Complete;
+	}
+
+	dev->power.is_noirq_suspended = true;
+
+	if (dev_pm_test_driver_flags(dev, DPM_FLAG_LEAVE_SUSPENDED)) {
+		/*
+		 * The only safe strategy here is to require that if the device
+		 * may not be left in suspend, resume callbacks must be invoked
+		 * for it.
+		 */
+		dev->power.must_resume = dev->power.must_resume ||
+					!dev->power.may_skip_resume ||
+					atomic_read(&dev->power.usage_count) > 1;
+	} else {
+		dev->power.must_resume = true;
+	}
+
+	if (dev->power.must_resume)
+		dpm_superior_set_must_resume(dev);
 
 Complete:
 	complete_all(&dev->power.completion);
@@ -1485,6 +1544,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 		dev->power.direct_complete = false;
 	}
 
+	dev->power.may_skip_resume = false;
+	dev->power.must_resume = false;
+
 	dpm_watchdog_set(&wd, dev);
 	device_lock(dev);
 
@@ -1650,8 +1712,9 @@ static int device_prepare(struct device *dev, pm_message_t state)
 	if (dev->power.syscore)
 		return 0;
 
-	WARN_ON(dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) &&
-		!pm_runtime_enabled(dev));
+	WARN_ON(!pm_runtime_enabled(dev) &&
+		dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND |
+					      DPM_FLAG_LEAVE_SUSPENDED));
 
 	/*
 	 * If a device's parent goes into runtime suspend at the wrong time,
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 65d39115f06d..b5a40b713e9e 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -556,9 +556,10 @@ struct pm_subsys_data {
  * These flags can be set by device drivers at the probe time.  They need not be
  * cleared by the drivers as the driver core will take care of that.
  *
- * NEVER_SKIP: Do not skip system suspend/resume callbacks for the device.
+ * NEVER_SKIP: Do not skip all system suspend/resume callbacks for the device.
  * SMART_PREPARE: Check the return value of the driver's ->prepare callback.
  * SMART_SUSPEND: No need to resume the device from runtime suspend.
+ * LEAVE_SUSPENDED: Avoid resuming the device during system resume if possible.
  *
  * Setting SMART_PREPARE instructs bus types and PM domains which may want
  * system suspend/resume callbacks to be skipped for the device to return 0 from
@@ -572,10 +573,14 @@ struct pm_subsys_data {
  * necessary from the driver's perspective.  It also may cause them to skip
  * invocations of the ->suspend_late and ->suspend_noirq callbacks provided by
  * the driver if they decide to leave the device in runtime suspend.
+ *
+ * Setting LEAVE_SUSPENDED informs the PM core and middle-layer code that the
+ * driver prefers the device to be left in suspend after system resume.
  */
-#define DPM_FLAG_NEVER_SKIP	BIT(0)
-#define DPM_FLAG_SMART_PREPARE	BIT(1)
-#define DPM_FLAG_SMART_SUSPEND	BIT(2)
+#define DPM_FLAG_NEVER_SKIP		BIT(0)
+#define DPM_FLAG_SMART_PREPARE		BIT(1)
+#define DPM_FLAG_SMART_SUSPEND		BIT(2)
+#define DPM_FLAG_LEAVE_SUSPENDED	BIT(3)
 
 struct dev_pm_info {
 	pm_message_t		power_state;
@@ -597,6 +602,8 @@ struct dev_pm_info {
 	bool			wakeup_path:1;
 	bool			syscore:1;
 	bool			no_pm_callbacks:1;	/* Owned by the PM core */
+	unsigned int		must_resume:1;	/* Owned by the PM core */
+	unsigned int		may_skip_resume:1;	/* Set by subsystems */
 #else
 	unsigned int		should_wakeup:1;
 #endif
@@ -765,6 +772,7 @@ extern int pm_generic_poweroff_late(struct device *dev);
 extern int pm_generic_poweroff(struct device *dev);
 extern void pm_generic_complete(struct device *dev);
 
+extern bool dev_pm_may_skip_resume(struct device *dev);
 extern bool dev_pm_smart_suspend_and_suspended(struct device *dev);
 
 #else /* !CONFIG_PM_SLEEP */

From bd755d770ac78e8eeda05877ba66cc66f151e10e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 18 Nov 2017 15:33:52 +0100
Subject: [PATCH 02/89] PCI / PM: Support for LEAVE_SUSPENDED driver flag

Add support for DPM_FLAG_LEAVE_SUSPENDED to the PCI bus type by
making it (a) set the power.may_skip_resume status bit for devices
that, from its perspective, may be left in suspend after system
wakeup from sleep and (b) return early from pci_pm_resume_noirq()
for devices whose remaining resume callbacks during the transition
under way are going to be skipped by the PM core.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 Documentation/power/pci.txt | 11 +++++++++++
 drivers/pci/pci-driver.c    | 19 +++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
index 704cd36079b8..8eaf9ee24d43 100644
--- a/Documentation/power/pci.txt
+++ b/Documentation/power/pci.txt
@@ -994,6 +994,17 @@ into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
 the function will set the power.direct_complete flag for it (to make the PM core
 skip the subsequent "thaw" callbacks for it) and return.
 
+Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
+device to be left in suspend after system-wide transitions to the working state.
+This flag is checked by the PM core, but the PCI bus type informs the PM core
+which devices may be left in suspend from its perspective (that happens during
+the "noirq" phase of system-wide suspend and analogous transitions) and next it
+uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
+pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
+callbacks for the device during the transition under way and will set its
+runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
+it.
+
 3.2. Device Runtime Power Management
 ------------------------------------
 In addition to providing device power management callbacks PCI device drivers
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 7f47bb72bf30..3cf2da22acf2 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -699,7 +699,7 @@ static void pci_pm_complete(struct device *dev)
 	pm_generic_complete(dev);
 
 	/* Resume device if platform firmware has put it in reset-power-on */
-	if (dev->power.direct_complete && pm_resume_via_firmware()) {
+	if (pm_runtime_suspended(dev) && pm_resume_via_firmware()) {
 		pci_power_t pre_sleep_state = pci_dev->current_state;
 
 		pci_update_current_state(pci_dev, pci_dev->current_state);
@@ -783,8 +783,10 @@ static int pci_pm_suspend_noirq(struct device *dev)
 	struct pci_dev *pci_dev = to_pci_dev(dev);
 	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
-	if (dev_pm_smart_suspend_and_suspended(dev))
+	if (dev_pm_smart_suspend_and_suspended(dev)) {
+		dev->power.may_skip_resume = true;
 		return 0;
+	}
 
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_suspend_late(dev, PMSG_SUSPEND);
@@ -838,6 +840,16 @@ static int pci_pm_suspend_noirq(struct device *dev)
 Fixup:
 	pci_fixup_device(pci_fixup_suspend_late, pci_dev);
 
+	/*
+	 * If the target system sleep state is suspend-to-idle, it is sufficient
+	 * to check whether or not the device's wakeup settings are good for
+	 * runtime PM.  Otherwise, the pm_resume_via_firmware() check will cause
+	 * pci_pm_complete() to take care of fixing up the device's state
+	 * anyway, if need be.
+	 */
+	dev->power.may_skip_resume = device_may_wakeup(dev) ||
+					!device_can_wakeup(dev);
+
 	return 0;
 }
 
@@ -847,6 +859,9 @@ static int pci_pm_resume_noirq(struct device *dev)
 	struct device_driver *drv = dev->driver;
 	int error = 0;
 
+	if (dev_pm_may_skip_resume(dev))
+		return 0;
+
 	/*
 	 * Devices with DPM_FLAG_SMART_SUSPEND may be left in runtime suspend
 	 * during system suspend, so update their runtime PM status to "active"

From db68daff90ef79761cc0bba16f775b6027ea3a83 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 18 Nov 2017 15:35:00 +0100
Subject: [PATCH 03/89] ACPI / PM: Support for LEAVE_SUSPENDED driver flag in
 ACPI PM domain

Add support for DPM_FLAG_LEAVE_SUSPENDED to the ACPI PM domain by
making it (a) set the power.may_skip_resume status bit for devices
that, from its perspective, may be left in suspend after system
wakeup from sleep and (b) return early from acpi_subsys_resume_noirq()
for devices whose remaining resume callbacks during the transition
under way are going to be skipped by the PM core.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/acpi/device_pm.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index e4ffaeec9ec2..5cfe794c36bd 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -990,7 +990,7 @@ void acpi_subsys_complete(struct device *dev)
 	 * the sleep state it is going out of and it has never been resumed till
 	 * now, resume it in case the firmware powered it up.
 	 */
-	if (dev->power.direct_complete && pm_resume_via_firmware())
+	if (pm_runtime_suspended(dev) && pm_resume_via_firmware())
 		pm_request_resume(dev);
 }
 EXPORT_SYMBOL_GPL(acpi_subsys_complete);
@@ -1039,10 +1039,28 @@ EXPORT_SYMBOL_GPL(acpi_subsys_suspend_late);
  */
 int acpi_subsys_suspend_noirq(struct device *dev)
 {
-	if (dev_pm_smart_suspend_and_suspended(dev))
-		return 0;
+	int ret;
 
-	return pm_generic_suspend_noirq(dev);
+	if (dev_pm_smart_suspend_and_suspended(dev)) {
+		dev->power.may_skip_resume = true;
+		return 0;
+	}
+
+	ret = pm_generic_suspend_noirq(dev);
+	if (ret)
+		return ret;
+
+	/*
+	 * If the target system sleep state is suspend-to-idle, it is sufficient
+	 * to check whether or not the device's wakeup settings are good for
+	 * runtime PM.  Otherwise, the pm_resume_via_firmware() check will cause
+	 * acpi_subsys_complete() to take care of fixing up the device's state
+	 * anyway, if need be.
+	 */
+	dev->power.may_skip_resume = device_may_wakeup(dev) ||
+					!device_can_wakeup(dev);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(acpi_subsys_suspend_noirq);
 
@@ -1052,6 +1070,9 @@ EXPORT_SYMBOL_GPL(acpi_subsys_suspend_noirq);
  */
 int acpi_subsys_resume_noirq(struct device *dev)
 {
+	if (dev_pm_may_skip_resume(dev))
+		return 0;
+
 	/*
 	 * Devices with DPM_FLAG_SMART_SUSPEND may be left in runtime suspend
 	 * during system suspend, so update their runtime PM status to "active"

From 57044031b0cb11325e1034394a4721484f9dc9fe Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 15 Nov 2017 02:16:55 +0100
Subject: [PATCH 04/89] ACPI / PM: Make it possible to ignore the system sleep
 blacklist

The ACPI code supporting system transitions to sleep states uses
an internal blacklist to apply special handling to some machines
reported to behave incorrectly in some ways.

However, some entries of that blacklist cover problematic as well as
non-problematic systems, so give the users of the latter a chance to
ignore the blacklist and run their systems in the default way by
adding acpi_sleep=nobl to the kernel command line.

For example, that allows the users of Dell XPS13 9360 systems not
affected by the issue that caused the blacklist entry for this
machine to be added by commit 71630b7a832f (ACPI / PM: Blacklist Low
Power S0 Idle _DSM for Dell XPS13 9360) to use suspend-to-idle with
the Low Power S0 Idle _DSM interface which in principle should be
more energy-efficient than S3 on them.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  5 ++++-
 arch/x86/kernel/acpi/sleep.c                    |  2 ++
 drivers/acpi/sleep.c                            | 10 ++++++++++
 include/linux/acpi.h                            |  1 +
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6571fbfdb2a1..b125690d5dbc 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -223,7 +223,7 @@
 
 	acpi_sleep=	[HW,ACPI] Sleep options
 			Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
-				  old_ordering, nonvs, sci_force_enable }
+				  old_ordering, nonvs, sci_force_enable, nobl }
 			See Documentation/power/video.txt for information on
 			s3_bios and s3_mode.
 			s3_beep is for debugging; it makes the PC's speaker beep
@@ -239,6 +239,9 @@
 			sci_force_enable causes the kernel to set SCI_EN directly
 			on resume from S1/S3 (which is against the ACPI spec,
 			but some broken systems don't work without it).
+			nobl causes the internal blacklist of systems known to
+			behave incorrectly in some ways with respect to system
+			suspend and resume to be ignored (use wisely).
 
 	acpi_use_timer_override [HW,ACPI]
 			Use timer override. For some broken Nvidia NF5 boards
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 7188aea91549..f1915b744052 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -138,6 +138,8 @@ static int __init acpi_sleep_setup(char *str)
 			acpi_nvs_nosave_s3();
 		if (strncmp(str, "old_ordering", 12) == 0)
 			acpi_old_suspend_ordering();
+		if (strncmp(str, "nobl", 4) == 0)
+			acpi_sleep_no_blacklist();
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 8082871b409a..15cd862a87c2 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -367,10 +367,20 @@ static const struct dmi_system_id acpisleep_dmi_table[] __initconst = {
 	{},
 };
 
+static bool ignore_blacklist;
+
+void __init acpi_sleep_no_blacklist(void)
+{
+	ignore_blacklist = true;
+}
+
 static void __init acpi_sleep_dmi_check(void)
 {
 	int year;
 
+	if (ignore_blacklist)
+		return;
+
 	if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year >= 2012)
 		acpi_nvs_nosave_s3();
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index dc1ebfeeb5ec..699655a9618b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -451,6 +451,7 @@ void __init acpi_no_s4_hw_signature(void);
 void __init acpi_old_suspend_ordering(void);
 void __init acpi_nvs_nosave(void);
 void __init acpi_nvs_nosave_s3(void);
+void __init acpi_sleep_no_blacklist(void);
 #endif /* CONFIG_PM_SLEEP */
 
 struct acpi_osc_context {

From 4ab53fe612e21b0f509a3b468c56706364de98df Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 29 Nov 2017 11:12:27 +0000
Subject: [PATCH 05/89] PM: Provide a config snippet for disabling PM

A frequent source of build problems is poor handling of optional PM
support, almost all development is done with the PM options enabled
but they can be turned off.  Currently few if any of the build test
services do this as standard as there is no standard config for it and
the use of selects and def_bool means that simply setting CONFIG_PM=n
doesn't do what is expected.  To make this easier provide a fragement
that can be used with KCONFIG_ALLCONFIG to force PM off.

CONFIG_XEN is disabled as Xen uses hibernation callbacks which end up
turning on power management on architectures with Xen.  Some cpuidle
implementations on ARM select PM so CONFIG_CPU_IDLE is disabled, and
some ARM architectures unconditionally enable PM so they are also
disabled.

Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS                |  1 +
 kernel/configs/nopm.config | 15 +++++++++++++++
 2 files changed, 16 insertions(+)
 create mode 100644 kernel/configs/nopm.config

diff --git a/MAINTAINERS b/MAINTAINERS
index d4fdcb12616c..ae6e590c5125 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10890,6 +10890,7 @@ F:	include/linux/pm.h
 F:	include/linux/pm_*
 F:	include/linux/powercap.h
 F:	drivers/powercap/
+F:	kernel/configs/nopm.config
 
 POWER STATE COORDINATION INTERFACE (PSCI)
 M:	Mark Rutland <mark.rutland@arm.com>
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
new file mode 100644
index 000000000000..81ff07863576
--- /dev/null
+++ b/kernel/configs/nopm.config
@@ -0,0 +1,15 @@
+CONFIG_PM=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+
+# Triggers PM on OMAP
+CONFIG_CPU_IDLE=n
+
+# Triggers enablement via hibernate callbacks
+CONFIG_XEN=n
+
+# ARM/ARM64 architectures that select PM unconditionally
+CONFIG_ARCH_OMAP2PLUS_TYPICAL=n
+CONFIG_ARCH_RENESAS=n
+CONFIG_ARCH_TEGRA=n
+CONFIG_ARCH_VEXPRESS=n

From 045149e6a22119e5bf0d16a0b24a4173a2abb71d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 23 Nov 2017 01:23:16 +0100
Subject: [PATCH 06/89] cpufreq: Clean up cpufreq_parse_governor()

Drop an unnecessary local variable from cpufreq_parse_governor()
and rearrange the code in there to make it easier to follow.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 41d148af7748..4d76b7c57b7a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -604,16 +604,15 @@ static struct cpufreq_governor *find_governor(const char *str_governor)
 static int cpufreq_parse_governor(char *str_governor, unsigned int *policy,
 				struct cpufreq_governor **governor)
 {
-	int err = -EINVAL;
-
 	if (cpufreq_driver->setpolicy) {
 		if (!strncasecmp(str_governor, "performance", CPUFREQ_NAME_LEN)) {
 			*policy = CPUFREQ_POLICY_PERFORMANCE;
-			err = 0;
-		} else if (!strncasecmp(str_governor, "powersave",
-						CPUFREQ_NAME_LEN)) {
+			return 0;
+		}
+
+		if (!strncasecmp(str_governor, "powersave", CPUFREQ_NAME_LEN)) {
 			*policy = CPUFREQ_POLICY_POWERSAVE;
-			err = 0;
+			return 0;
 		}
 	} else {
 		struct cpufreq_governor *t;
@@ -621,26 +620,29 @@ static int cpufreq_parse_governor(char *str_governor, unsigned int *policy,
 		mutex_lock(&cpufreq_governor_mutex);
 
 		t = find_governor(str_governor);
-
-		if (t == NULL) {
+		if (!t) {
 			int ret;
 
 			mutex_unlock(&cpufreq_governor_mutex);
+
 			ret = request_module("cpufreq_%s", str_governor);
+			if (ret)
+				return -EINVAL;
+
 			mutex_lock(&cpufreq_governor_mutex);
 
-			if (ret == 0)
-				t = find_governor(str_governor);
-		}
-
-		if (t != NULL) {
-			*governor = t;
-			err = 0;
+			t = find_governor(str_governor);
 		}
 
 		mutex_unlock(&cpufreq_governor_mutex);
+
+		if (t) {
+			*governor = t;
+			return 0;
+		}
 	}
-	return err;
+
+	return -EINVAL;
 }
 
 /**

From ae0ff89f36b282ef32ff0f73e847352ea625464c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 23 Nov 2017 01:24:05 +0100
Subject: [PATCH 07/89] cpufreq: Pass policy pointer to
 cpufreq_parse_governor()

Pass policy pointer to cpufreq_parse_governor() instead of passing
pointers to two members of it so as to make the code slightly more
straightforward.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 4d76b7c57b7a..8f356c4befda 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -601,17 +601,17 @@ static struct cpufreq_governor *find_governor(const char *str_governor)
 /**
  * cpufreq_parse_governor - parse a governor string
  */
-static int cpufreq_parse_governor(char *str_governor, unsigned int *policy,
-				struct cpufreq_governor **governor)
+static int cpufreq_parse_governor(char *str_governor,
+				  struct cpufreq_policy *policy)
 {
 	if (cpufreq_driver->setpolicy) {
 		if (!strncasecmp(str_governor, "performance", CPUFREQ_NAME_LEN)) {
-			*policy = CPUFREQ_POLICY_PERFORMANCE;
+			policy->policy = CPUFREQ_POLICY_PERFORMANCE;
 			return 0;
 		}
 
 		if (!strncasecmp(str_governor, "powersave", CPUFREQ_NAME_LEN)) {
-			*policy = CPUFREQ_POLICY_POWERSAVE;
+			policy->policy = CPUFREQ_POLICY_POWERSAVE;
 			return 0;
 		}
 	} else {
@@ -637,7 +637,7 @@ static int cpufreq_parse_governor(char *str_governor, unsigned int *policy,
 		mutex_unlock(&cpufreq_governor_mutex);
 
 		if (t) {
-			*governor = t;
+			policy->governor = t;
 			return 0;
 		}
 	}
@@ -762,8 +762,7 @@ static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
 	if (ret != 1)
 		return -EINVAL;
 
-	if (cpufreq_parse_governor(str_governor, &new_policy.policy,
-						&new_policy.governor))
+	if (cpufreq_parse_governor(str_governor, &new_policy))
 		return -EINVAL;
 
 	ret = cpufreq_set_policy(policy, &new_policy);
@@ -1046,8 +1045,7 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy)
 		if (policy->last_policy)
 			new_policy.policy = policy->last_policy;
 		else
-			cpufreq_parse_governor(gov->name, &new_policy.policy,
-					       NULL);
+			cpufreq_parse_governor(gov->name, &new_policy);
 	}
 	/* set default policy */
 	return cpufreq_set_policy(policy, &new_policy);

From 70d1ff71161b1c56c6d025e6a957bc878dfd940b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 23 Nov 2017 01:30:16 +0100
Subject: [PATCH 08/89] cpufreq: Drop pointless return statement

Drop a pointless return statement from cpufreq_unregister_governor().

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 8f356c4befda..d2a22de4e4d2 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2160,7 +2160,6 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor)
 	mutex_lock(&cpufreq_governor_mutex);
 	list_del(&governor->governor_list);
 	mutex_unlock(&cpufreq_governor_mutex);
-	return;
 }
 EXPORT_SYMBOL_GPL(cpufreq_unregister_governor);
 

From a8b149d32b663c1a4105273295184b78f53d33cf Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 23 Nov 2017 14:27:07 +0100
Subject: [PATCH 09/89] cpufreq: Fix governor module removal race

It is possible to remove a cpufreq governor module after
cpufreq_parse_governor() has returned success in
store_scaling_governor() and before cpufreq_set_policy()
acquires a reference to it, because the governor list is
not protected during that period and nothing prevents the
governor from being unregistered then.

Prevent that from happening by acquiring an extra reference
to the governor module temporarily in cpufreq_parse_governor(),
under cpufreq_governor_mutex, and dropping it in
store_scaling_governor(), when cpufreq_set_policy() returns.

Note that the second cpufreq_parse_governor() call site is fine,
because it only cares about the policy member of new_policy.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index d2a22de4e4d2..421f318c0e66 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -633,6 +633,8 @@ static int cpufreq_parse_governor(char *str_governor,
 
 			t = find_governor(str_governor);
 		}
+		if (t && !try_module_get(t->owner))
+			t = NULL;
 
 		mutex_unlock(&cpufreq_governor_mutex);
 
@@ -766,6 +768,10 @@ static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
 		return -EINVAL;
 
 	ret = cpufreq_set_policy(policy, &new_policy);
+
+	if (new_policy.governor)
+		module_put(new_policy.governor->owner);
+
 	return ret ? ret : count;
 }
 

From 325c4b3b81027068914854adcba4e97200c809df Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 10 Nov 2017 20:28:07 +0200
Subject: [PATCH 10/89] PM / sysfs: Convert to use sysfs_streq()

...instead of custom approach.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/sysfs.c | 39 +++++++++-----------------------------
 1 file changed, 9 insertions(+), 30 deletions(-)

diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index e153e28b1857..662632ac5e0e 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -108,16 +108,10 @@ static ssize_t control_show(struct device *dev, struct device_attribute *attr,
 static ssize_t control_store(struct device * dev, struct device_attribute *attr,
 			     const char * buf, size_t n)
 {
-	char *cp;
-	int len = n;
-
-	cp = memchr(buf, '\n', n);
-	if (cp)
-		len = cp - buf;
 	device_lock(dev);
-	if (len == sizeof ctrl_auto - 1 && strncmp(buf, ctrl_auto, len) == 0)
+	if (sysfs_streq(buf, ctrl_auto))
 		pm_runtime_allow(dev);
-	else if (len == sizeof ctrl_on - 1 && strncmp(buf, ctrl_on, len) == 0)
+	else if (sysfs_streq(buf, ctrl_on))
 		pm_runtime_forbid(dev);
 	else
 		n = -EINVAL;
@@ -245,7 +239,7 @@ static ssize_t pm_qos_resume_latency_store(struct device *dev,
 
 		if (value == 0)
 			value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
-	} else if (!strcmp(buf, "n/a") || !strcmp(buf, "n/a\n")) {
+	} else if (sysfs_streq(buf, "n/a")) {
 		value = 0;
 	} else {
 		return -EINVAL;
@@ -285,9 +279,9 @@ static ssize_t pm_qos_latency_tolerance_store(struct device *dev,
 		if (value < 0)
 			return -EINVAL;
 	} else {
-		if (!strcmp(buf, "auto") || !strcmp(buf, "auto\n"))
+		if (sysfs_streq(buf, "auto"))
 			value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT;
-		else if (!strcmp(buf, "any") || !strcmp(buf, "any\n"))
+		else if (sysfs_streq(buf, "any"))
 			value = PM_QOS_LATENCY_ANY;
 		else
 			return -EINVAL;
@@ -342,20 +336,12 @@ static ssize_t
 wake_store(struct device * dev, struct device_attribute *attr,
 	const char * buf, size_t n)
 {
-	char *cp;
-	int len = n;
-
 	if (!device_can_wakeup(dev))
 		return -EINVAL;
 
-	cp = memchr(buf, '\n', n);
-	if (cp)
-		len = cp - buf;
-	if (len == sizeof _enabled - 1
-			&& strncmp(buf, _enabled, sizeof _enabled - 1) == 0)
+	if (sysfs_streq(buf, _enabled))
 		device_set_wakeup_enable(dev, 1);
-	else if (len == sizeof _disabled - 1
-			&& strncmp(buf, _disabled, sizeof _disabled - 1) == 0)
+	else if (sysfs_streq(buf, _disabled))
 		device_set_wakeup_enable(dev, 0);
 	else
 		return -EINVAL;
@@ -566,16 +552,9 @@ static ssize_t async_show(struct device *dev, struct device_attribute *attr,
 static ssize_t async_store(struct device *dev, struct device_attribute *attr,
 			   const char *buf, size_t n)
 {
-	char *cp;
-	int len = n;
-
-	cp = memchr(buf, '\n', n);
-	if (cp)
-		len = cp - buf;
-	if (len == sizeof _enabled - 1 && strncmp(buf, _enabled, len) == 0)
+	if (sysfs_streq(buf, _enabled))
 		device_enable_async_suspend(dev);
-	else if (len == sizeof _disabled - 1 &&
-		 strncmp(buf, _disabled, len) == 0)
+	else if (sysfs_streq(buf, _disabled))
 		device_disable_async_suspend(dev);
 	else
 		return -EINVAL;

From f0e6d9f164c2269df69b6d2fe05c285392a6a0d4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 10 Nov 2017 20:28:08 +0200
Subject: [PATCH 11/89] PM / sysfs: Remove redundant 'else' keyword.

There is no need to use 'else' if in main branch 'return' is present.

No functional change intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/sysfs.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index 662632ac5e0e..1bf5e163ef1f 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -216,7 +216,7 @@ static ssize_t pm_qos_resume_latency_show(struct device *dev,
 
 	if (value == 0)
 		return sprintf(buf, "n/a\n");
-	else if (value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT)
+	if (value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT)
 		value = 0;
 
 	return sprintf(buf, "%d\n", value);
@@ -261,7 +261,7 @@ static ssize_t pm_qos_latency_tolerance_show(struct device *dev,
 
 	if (value < 0)
 		return sprintf(buf, "auto\n");
-	else if (value == PM_QOS_LATENCY_ANY)
+	if (value == PM_QOS_LATENCY_ANY)
 		return sprintf(buf, "any\n");
 
 	return sprintf(buf, "%d\n", value);
@@ -527,11 +527,11 @@ static ssize_t rtpm_children_show(struct device *dev,
 static ssize_t rtpm_enabled_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)
 {
-	if ((dev->power.disable_depth) && (dev->power.runtime_auto == false))
+	if (dev->power.disable_depth && (dev->power.runtime_auto == false))
 		return sprintf(buf, "disabled & forbidden\n");
-	else if (dev->power.disable_depth)
+	if (dev->power.disable_depth)
 		return sprintf(buf, "disabled\n");
-	else if (dev->power.runtime_auto == false)
+	if (dev->power.runtime_auto == false)
 		return sprintf(buf, "forbidden\n");
 	return sprintf(buf, "enabled\n");
 }

From 47acbd77e6e481abf2f41d3a99cb3762f296b2e6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 10 Nov 2017 20:28:09 +0200
Subject: [PATCH 12/89] PM / sysfs: Convert to use DEVICE_ATTR_RO /
 DEVICE_ATTR_RW

Use DEVICE_ATTR_RO() and DEVICE_ATTR_RW() macros instead of
open coding them.

No functional change intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/sysfs.c | 133 ++++++++++++++++++-------------------
 1 file changed, 65 insertions(+), 68 deletions(-)

diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index 1bf5e163ef1f..0f651efc58a1 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -119,9 +119,9 @@ static ssize_t control_store(struct device * dev, struct device_attribute *attr,
 	return n;
 }
 
-static DEVICE_ATTR(control, 0644, control_show, control_store);
+static DEVICE_ATTR_RW(control);
 
-static ssize_t rtpm_active_time_show(struct device *dev,
+static ssize_t runtime_active_time_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	int ret;
@@ -132,9 +132,9 @@ static ssize_t rtpm_active_time_show(struct device *dev,
 	return ret;
 }
 
-static DEVICE_ATTR(runtime_active_time, 0444, rtpm_active_time_show, NULL);
+static DEVICE_ATTR_RO(runtime_active_time);
 
-static ssize_t rtpm_suspended_time_show(struct device *dev,
+static ssize_t runtime_suspended_time_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	int ret;
@@ -146,9 +146,9 @@ static ssize_t rtpm_suspended_time_show(struct device *dev,
 	return ret;
 }
 
-static DEVICE_ATTR(runtime_suspended_time, 0444, rtpm_suspended_time_show, NULL);
+static DEVICE_ATTR_RO(runtime_suspended_time);
 
-static ssize_t rtpm_status_show(struct device *dev,
+static ssize_t runtime_status_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	const char *p;
@@ -178,7 +178,7 @@ static ssize_t rtpm_status_show(struct device *dev,
 	return sprintf(buf, p);
 }
 
-static DEVICE_ATTR(runtime_status, 0444, rtpm_status_show, NULL);
+static DEVICE_ATTR_RO(runtime_status);
 
 static ssize_t autosuspend_delay_ms_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
@@ -205,12 +205,11 @@ static ssize_t autosuspend_delay_ms_store(struct device *dev,
 	return n;
 }
 
-static DEVICE_ATTR(autosuspend_delay_ms, 0644, autosuspend_delay_ms_show,
-		autosuspend_delay_ms_store);
+static DEVICE_ATTR_RW(autosuspend_delay_ms);
 
-static ssize_t pm_qos_resume_latency_show(struct device *dev,
-					  struct device_attribute *attr,
-					  char *buf)
+static ssize_t pm_qos_resume_latency_us_show(struct device *dev,
+					     struct device_attribute *attr,
+					     char *buf)
 {
 	s32 value = dev_pm_qos_requested_resume_latency(dev);
 
@@ -222,9 +221,9 @@ static ssize_t pm_qos_resume_latency_show(struct device *dev,
 	return sprintf(buf, "%d\n", value);
 }
 
-static ssize_t pm_qos_resume_latency_store(struct device *dev,
-					   struct device_attribute *attr,
-					   const char *buf, size_t n)
+static ssize_t pm_qos_resume_latency_us_store(struct device *dev,
+					      struct device_attribute *attr,
+					      const char *buf, size_t n)
 {
 	s32 value;
 	int ret;
@@ -250,12 +249,11 @@ static ssize_t pm_qos_resume_latency_store(struct device *dev,
 	return ret < 0 ? ret : n;
 }
 
-static DEVICE_ATTR(pm_qos_resume_latency_us, 0644,
-		   pm_qos_resume_latency_show, pm_qos_resume_latency_store);
+static DEVICE_ATTR_RW(pm_qos_resume_latency_us);
 
-static ssize_t pm_qos_latency_tolerance_show(struct device *dev,
-					     struct device_attribute *attr,
-					     char *buf)
+static ssize_t pm_qos_latency_tolerance_us_show(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
 {
 	s32 value = dev_pm_qos_get_user_latency_tolerance(dev);
 
@@ -267,9 +265,9 @@ static ssize_t pm_qos_latency_tolerance_show(struct device *dev,
 	return sprintf(buf, "%d\n", value);
 }
 
-static ssize_t pm_qos_latency_tolerance_store(struct device *dev,
-					      struct device_attribute *attr,
-					      const char *buf, size_t n)
+static ssize_t pm_qos_latency_tolerance_us_store(struct device *dev,
+						 struct device_attribute *attr,
+						 const char *buf, size_t n)
 {
 	s32 value;
 	int ret;
@@ -290,8 +288,7 @@ static ssize_t pm_qos_latency_tolerance_store(struct device *dev,
 	return ret < 0 ? ret : n;
 }
 
-static DEVICE_ATTR(pm_qos_latency_tolerance_us, 0644,
-		   pm_qos_latency_tolerance_show, pm_qos_latency_tolerance_store);
+static DEVICE_ATTR_RW(pm_qos_latency_tolerance_us);
 
 static ssize_t pm_qos_no_power_off_show(struct device *dev,
 					struct device_attribute *attr,
@@ -317,24 +314,22 @@ static ssize_t pm_qos_no_power_off_store(struct device *dev,
 	return ret < 0 ? ret : n;
 }
 
-static DEVICE_ATTR(pm_qos_no_power_off, 0644,
-		   pm_qos_no_power_off_show, pm_qos_no_power_off_store);
+static DEVICE_ATTR_RW(pm_qos_no_power_off);
 
 #ifdef CONFIG_PM_SLEEP
 static const char _enabled[] = "enabled";
 static const char _disabled[] = "disabled";
 
-static ssize_t
-wake_show(struct device * dev, struct device_attribute *attr, char * buf)
+static ssize_t wakeup_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
 {
 	return sprintf(buf, "%s\n", device_can_wakeup(dev)
 		? (device_may_wakeup(dev) ? _enabled : _disabled)
 		: "");
 }
 
-static ssize_t
-wake_store(struct device * dev, struct device_attribute *attr,
-	const char * buf, size_t n)
+static ssize_t wakeup_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t n)
 {
 	if (!device_can_wakeup(dev))
 		return -EINVAL;
@@ -348,10 +343,10 @@ wake_store(struct device * dev, struct device_attribute *attr,
 	return n;
 }
 
-static DEVICE_ATTR(wakeup, 0644, wake_show, wake_store);
+static DEVICE_ATTR_RW(wakeup);
 
 static ssize_t wakeup_count_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
+				 struct device_attribute *attr, char *buf)
 {
 	unsigned long count = 0;
 	bool enabled = false;
@@ -365,10 +360,11 @@ static ssize_t wakeup_count_show(struct device *dev,
 	return enabled ? sprintf(buf, "%lu\n", count) : sprintf(buf, "\n");
 }
 
-static DEVICE_ATTR(wakeup_count, 0444, wakeup_count_show, NULL);
+static DEVICE_ATTR_RO(wakeup_count);
 
 static ssize_t wakeup_active_count_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
+					struct device_attribute *attr,
+					char *buf)
 {
 	unsigned long count = 0;
 	bool enabled = false;
@@ -382,11 +378,11 @@ static ssize_t wakeup_active_count_show(struct device *dev,
 	return enabled ? sprintf(buf, "%lu\n", count) : sprintf(buf, "\n");
 }
 
-static DEVICE_ATTR(wakeup_active_count, 0444, wakeup_active_count_show, NULL);
+static DEVICE_ATTR_RO(wakeup_active_count);
 
 static ssize_t wakeup_abort_count_show(struct device *dev,
-					struct device_attribute *attr,
-					char *buf)
+				       struct device_attribute *attr,
+				       char *buf)
 {
 	unsigned long count = 0;
 	bool enabled = false;
@@ -400,7 +396,7 @@ static ssize_t wakeup_abort_count_show(struct device *dev,
 	return enabled ? sprintf(buf, "%lu\n", count) : sprintf(buf, "\n");
 }
 
-static DEVICE_ATTR(wakeup_abort_count, 0444, wakeup_abort_count_show, NULL);
+static DEVICE_ATTR_RO(wakeup_abort_count);
 
 static ssize_t wakeup_expire_count_show(struct device *dev,
 					struct device_attribute *attr,
@@ -418,10 +414,10 @@ static ssize_t wakeup_expire_count_show(struct device *dev,
 	return enabled ? sprintf(buf, "%lu\n", count) : sprintf(buf, "\n");
 }
 
-static DEVICE_ATTR(wakeup_expire_count, 0444, wakeup_expire_count_show, NULL);
+static DEVICE_ATTR_RO(wakeup_expire_count);
 
 static ssize_t wakeup_active_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
+				  struct device_attribute *attr, char *buf)
 {
 	unsigned int active = 0;
 	bool enabled = false;
@@ -435,10 +431,11 @@ static ssize_t wakeup_active_show(struct device *dev,
 	return enabled ? sprintf(buf, "%u\n", active) : sprintf(buf, "\n");
 }
 
-static DEVICE_ATTR(wakeup_active, 0444, wakeup_active_show, NULL);
+static DEVICE_ATTR_RO(wakeup_active);
 
-static ssize_t wakeup_total_time_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
+static ssize_t wakeup_total_time_ms_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
 {
 	s64 msec = 0;
 	bool enabled = false;
@@ -452,10 +449,10 @@ static ssize_t wakeup_total_time_show(struct device *dev,
 	return enabled ? sprintf(buf, "%lld\n", msec) : sprintf(buf, "\n");
 }
 
-static DEVICE_ATTR(wakeup_total_time_ms, 0444, wakeup_total_time_show, NULL);
+static DEVICE_ATTR_RO(wakeup_total_time_ms);
 
-static ssize_t wakeup_max_time_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
+static ssize_t wakeup_max_time_ms_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
 {
 	s64 msec = 0;
 	bool enabled = false;
@@ -469,10 +466,11 @@ static ssize_t wakeup_max_time_show(struct device *dev,
 	return enabled ? sprintf(buf, "%lld\n", msec) : sprintf(buf, "\n");
 }
 
-static DEVICE_ATTR(wakeup_max_time_ms, 0444, wakeup_max_time_show, NULL);
+static DEVICE_ATTR_RO(wakeup_max_time_ms);
 
-static ssize_t wakeup_last_time_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
+static ssize_t wakeup_last_time_ms_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
 {
 	s64 msec = 0;
 	bool enabled = false;
@@ -486,12 +484,12 @@ static ssize_t wakeup_last_time_show(struct device *dev,
 	return enabled ? sprintf(buf, "%lld\n", msec) : sprintf(buf, "\n");
 }
 
-static DEVICE_ATTR(wakeup_last_time_ms, 0444, wakeup_last_time_show, NULL);
+static DEVICE_ATTR_RO(wakeup_last_time_ms);
 
 #ifdef CONFIG_PM_AUTOSLEEP
-static ssize_t wakeup_prevent_sleep_time_show(struct device *dev,
-					      struct device_attribute *attr,
-					      char *buf)
+static ssize_t wakeup_prevent_sleep_time_ms_show(struct device *dev,
+						 struct device_attribute *attr,
+						 char *buf)
 {
 	s64 msec = 0;
 	bool enabled = false;
@@ -505,27 +503,29 @@ static ssize_t wakeup_prevent_sleep_time_show(struct device *dev,
 	return enabled ? sprintf(buf, "%lld\n", msec) : sprintf(buf, "\n");
 }
 
-static DEVICE_ATTR(wakeup_prevent_sleep_time_ms, 0444,
-		   wakeup_prevent_sleep_time_show, NULL);
+static DEVICE_ATTR_RO(wakeup_prevent_sleep_time_ms);
 #endif /* CONFIG_PM_AUTOSLEEP */
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_PM_ADVANCED_DEBUG
-static ssize_t rtpm_usagecount_show(struct device *dev,
-				    struct device_attribute *attr, char *buf)
+static ssize_t runtime_usage_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%d\n", atomic_read(&dev->power.usage_count));
 }
+static DEVICE_ATTR_RO(runtime_usage);
 
-static ssize_t rtpm_children_show(struct device *dev,
-				  struct device_attribute *attr, char *buf)
+static ssize_t runtime_active_kids_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
 {
 	return sprintf(buf, "%d\n", dev->power.ignore_children ?
 		0 : atomic_read(&dev->power.child_count));
 }
+static DEVICE_ATTR_RO(runtime_active_kids);
 
-static ssize_t rtpm_enabled_show(struct device *dev,
-				 struct device_attribute *attr, char *buf)
+static ssize_t runtime_enabled_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
 {
 	if (dev->power.disable_depth && (dev->power.runtime_auto == false))
 		return sprintf(buf, "disabled & forbidden\n");
@@ -535,10 +535,7 @@ static ssize_t rtpm_enabled_show(struct device *dev,
 		return sprintf(buf, "forbidden\n");
 	return sprintf(buf, "enabled\n");
 }
-
-static DEVICE_ATTR(runtime_usage, 0444, rtpm_usagecount_show, NULL);
-static DEVICE_ATTR(runtime_active_kids, 0444, rtpm_children_show, NULL);
-static DEVICE_ATTR(runtime_enabled, 0444, rtpm_enabled_show, NULL);
+static DEVICE_ATTR_RO(runtime_enabled);
 
 #ifdef CONFIG_PM_SLEEP
 static ssize_t async_show(struct device *dev, struct device_attribute *attr,
@@ -561,7 +558,7 @@ static ssize_t async_store(struct device *dev, struct device_attribute *attr,
 	return n;
 }
 
-static DEVICE_ATTR(async, 0644, async_show, async_store);
+static DEVICE_ATTR_RW(async);
 
 #endif /* CONFIG_PM_SLEEP */
 #endif /* CONFIG_PM_ADVANCED_DEBUG */

From 1172ee31259b51a9b2d83b05f01161fd5938b15d Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 13 Nov 2017 16:46:41 +0100
Subject: [PATCH 13/89] PM / core: Re-factor some code dealing with parents in
 __device_suspend()

Let's make the code a bit more readable by moving some of the code, which
deals with adjustments for parent devices in __device_suspend(), into its
own function.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/main.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 73ec6796d9e1..c0d5f4a3611d 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1479,6 +1479,22 @@ static int legacy_suspend(struct device *dev, pm_message_t state,
 	return error;
 }
 
+static void dpm_propagate_to_parent(struct device *dev)
+{
+	struct device *parent = dev->parent;
+
+	if (!parent)
+		return;
+
+	spin_lock_irq(&parent->power.lock);
+
+	parent->power.direct_complete = false;
+	if (dev->power.wakeup_path && !parent->power.ignore_children)
+		parent->power.wakeup_path = true;
+
+	spin_unlock_irq(&parent->power.lock);
+}
+
 static void dpm_clear_suppliers_direct_complete(struct device *dev)
 {
 	struct device_link *link;
@@ -1590,19 +1606,8 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 
  End:
 	if (!error) {
-		struct device *parent = dev->parent;
-
 		dev->power.is_suspended = true;
-		if (parent) {
-			spin_lock_irq(&parent->power.lock);
-
-			dev->parent->power.direct_complete = false;
-			if (dev->power.wakeup_path
-			    && !dev->parent->power.ignore_children)
-				dev->parent->power.wakeup_path = true;
-
-			spin_unlock_irq(&parent->power.lock);
-		}
+		dpm_propagate_to_parent(dev);
 		dpm_clear_suppliers_direct_complete(dev);
 	}
 

From f5f263fed66f75a4482d7ad49392b4283a05885a Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 5 Dec 2017 11:02:43 +0530
Subject: [PATCH 14/89] cpu_cooling: Make of_cpufreq_power_cooling_register()
 parse DT

All the callers of of_cpufreq_power_cooling_register() have almost
identical code and it makes more sense to move that code into the helper
as its all about reading DT properties.

This got rid of lot of redundant code.

Acked-by: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/thermal/cpu-cooling-api.txt |  7 ++--
 drivers/cpufreq/arm_big_little.c          | 23 +----------
 drivers/cpufreq/cpufreq-dt.c              | 27 +------------
 drivers/cpufreq/mediatek-cpufreq.c        | 22 +---------
 drivers/cpufreq/qoriq-cpufreq.c           | 14 +------
 drivers/thermal/cpu_cooling.c             | 49 ++++++++++++++---------
 include/linux/cpu_cooling.h               | 15 ++-----
 7 files changed, 41 insertions(+), 116 deletions(-)

diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
index 71653584cd03..4f6f5e9bb4d6 100644
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ b/Documentation/thermal/cpu-cooling-api.txt
@@ -51,8 +51,7 @@ Dynamic power).  "plat_static_func" is a function to calculate the
 static power consumed by these cpus (See 2.2 Static power).
 
 1.1.4 struct thermal_cooling_device *of_cpufreq_power_cooling_register(
-    struct device_node *np, const struct cpumask *clip_cpus, u32 capacitance,
-    get_static_t plat_static_func)
+					struct cpufreq_policy *policy)
 
 Similar to cpufreq_power_cooling_register, this function register a
 cpufreq cooling device with power extensions using the device tree
@@ -76,8 +75,8 @@ cpu.  If you are using CONFIG_CPUFREQ_DT then the
 device.
 
 The `plat_static_func` parameter of `cpufreq_power_cooling_register()`
-and `of_cpufreq_power_cooling_register()` is optional.  If you don't
-provide it, only dynamic power will be considered.
+is optional.  If you don't provide it, only dynamic power will be
+considered.
 
 2.1 Dynamic power
 
diff --git a/drivers/cpufreq/arm_big_little.c b/drivers/cpufreq/arm_big_little.c
index 65ec5f01aa8d..3d5ed4ef3927 100644
--- a/drivers/cpufreq/arm_big_little.c
+++ b/drivers/cpufreq/arm_big_little.c
@@ -526,34 +526,13 @@ static int bL_cpufreq_exit(struct cpufreq_policy *policy)
 
 static void bL_cpufreq_ready(struct cpufreq_policy *policy)
 {
-	struct device *cpu_dev = get_cpu_device(policy->cpu);
 	int cur_cluster = cpu_to_cluster(policy->cpu);
-	struct device_node *np;
 
 	/* Do not register a cpu_cooling device if we are in IKS mode */
 	if (cur_cluster >= MAX_CLUSTERS)
 		return;
 
-	np = of_node_get(cpu_dev->of_node);
-	if (WARN_ON(!np))
-		return;
-
-	if (of_find_property(np, "#cooling-cells", NULL)) {
-		u32 power_coefficient = 0;
-
-		of_property_read_u32(np, "dynamic-power-coefficient",
-				     &power_coefficient);
-
-		cdev[cur_cluster] = of_cpufreq_power_cooling_register(np,
-				policy, power_coefficient, NULL);
-		if (IS_ERR(cdev[cur_cluster])) {
-			dev_err(cpu_dev,
-				"running cpufreq without cooling device: %ld\n",
-				PTR_ERR(cdev[cur_cluster]));
-			cdev[cur_cluster] = NULL;
-		}
-	}
-	of_node_put(np);
+	cdev[cur_cluster] = of_cpufreq_power_cooling_register(policy);
 }
 
 static struct cpufreq_driver bL_cpufreq_driver = {
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 545946ad0752..1e7bec7694ab 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -319,33 +319,8 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
 static void cpufreq_ready(struct cpufreq_policy *policy)
 {
 	struct private_data *priv = policy->driver_data;
-	struct device_node *np = of_node_get(priv->cpu_dev->of_node);
 
-	if (WARN_ON(!np))
-		return;
-
-	/*
-	 * For now, just loading the cooling device;
-	 * thermal DT code takes care of matching them.
-	 */
-	if (of_find_property(np, "#cooling-cells", NULL)) {
-		u32 power_coefficient = 0;
-
-		of_property_read_u32(np, "dynamic-power-coefficient",
-				     &power_coefficient);
-
-		priv->cdev = of_cpufreq_power_cooling_register(np,
-				policy, power_coefficient, NULL);
-		if (IS_ERR(priv->cdev)) {
-			dev_err(priv->cpu_dev,
-				"running cpufreq without cooling device: %ld\n",
-				PTR_ERR(priv->cdev));
-
-			priv->cdev = NULL;
-		}
-	}
-
-	of_node_put(np);
+	priv->cdev = of_cpufreq_power_cooling_register(policy);
 }
 
 static struct cpufreq_driver dt_cpufreq_driver = {
diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c
index e0d5090b303d..6ff783e1b18a 100644
--- a/drivers/cpufreq/mediatek-cpufreq.c
+++ b/drivers/cpufreq/mediatek-cpufreq.c
@@ -310,28 +310,8 @@ static int mtk_cpufreq_set_target(struct cpufreq_policy *policy,
 static void mtk_cpufreq_ready(struct cpufreq_policy *policy)
 {
 	struct mtk_cpu_dvfs_info *info = policy->driver_data;
-	struct device_node *np = of_node_get(info->cpu_dev->of_node);
-	u32 capacitance = 0;
 
-	if (WARN_ON(!np))
-		return;
-
-	if (of_find_property(np, "#cooling-cells", NULL)) {
-		of_property_read_u32(np, DYNAMIC_POWER, &capacitance);
-
-		info->cdev = of_cpufreq_power_cooling_register(np,
-						policy, capacitance, NULL);
-
-		if (IS_ERR(info->cdev)) {
-			dev_err(info->cpu_dev,
-				"running cpufreq without cooling device: %ld\n",
-				PTR_ERR(info->cdev));
-
-			info->cdev = NULL;
-		}
-	}
-
-	of_node_put(np);
+	info->cdev = of_cpufreq_power_cooling_register(policy);
 }
 
 static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu)
diff --git a/drivers/cpufreq/qoriq-cpufreq.c b/drivers/cpufreq/qoriq-cpufreq.c
index 4ada55b8856e..3a665c18e14e 100644
--- a/drivers/cpufreq/qoriq-cpufreq.c
+++ b/drivers/cpufreq/qoriq-cpufreq.c
@@ -275,20 +275,8 @@ static int qoriq_cpufreq_target(struct cpufreq_policy *policy,
 static void qoriq_cpufreq_ready(struct cpufreq_policy *policy)
 {
 	struct cpu_data *cpud = policy->driver_data;
-	struct device_node *np = of_get_cpu_node(policy->cpu, NULL);
 
-	if (of_find_property(np, "#cooling-cells", NULL)) {
-		cpud->cdev = of_cpufreq_cooling_register(np, policy);
-
-		if (IS_ERR(cpud->cdev) && PTR_ERR(cpud->cdev) != -ENOSYS) {
-			pr_err("cpu%d is not running as cooling device: %ld\n",
-					policy->cpu, PTR_ERR(cpud->cdev));
-
-			cpud->cdev = NULL;
-		}
-	}
-
-	of_node_put(np);
+	cpud->cdev = of_cpufreq_power_cooling_register(policy);
 }
 
 static struct cpufreq_driver qoriq_cpufreq_driver = {
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index dc63aba092e4..a31eb03c788e 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -873,38 +873,51 @@ EXPORT_SYMBOL(cpufreq_power_cooling_register);
 
 /**
  * of_cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
- * @np:	a valid struct device_node to the cooling device device tree node
- * @policy: cpufreq policy
- * @capacitance:	dynamic power coefficient for these cpus
- * @plat_static_func:	function to calculate the static power consumed by these
- *			cpus (optional)
+ * @policy: CPUFreq policy.
  *
  * This interface function registers the cpufreq cooling device with
  * the name "thermal-cpufreq-%x".  This api can support multiple
  * instances of cpufreq cooling devices.  Using this API, the cpufreq
- * cooling device will be linked to the device tree node provided.
+ * cooling device will be linked to the device tree node of the provided
+ * policy's CPU.
  * Using this function, the cooling device will implement the power
  * extensions by using a simple cpu power model.  The cpus must have
  * registered their OPPs using the OPP library.
  *
- * An optional @plat_static_func may be provided to calculate the
- * static power consumed by these cpus.  If the platform's static
- * power consumption is unknown or negligible, make it NULL.
+ * It also takes into account, if property present in policy CPU node, the
+ * static power consumed by the cpu.
  *
  * Return: a valid struct thermal_cooling_device pointer on success,
- * on failure, it returns a corresponding ERR_PTR().
+ * and NULL on failure.
  */
 struct thermal_cooling_device *
-of_cpufreq_power_cooling_register(struct device_node *np,
-				  struct cpufreq_policy *policy,
-				  u32 capacitance,
-				  get_static_t plat_static_func)
+of_cpufreq_power_cooling_register(struct cpufreq_policy *policy)
 {
-	if (!np)
-		return ERR_PTR(-EINVAL);
+	struct device_node *np = of_get_cpu_node(policy->cpu, NULL);
+	struct thermal_cooling_device *cdev = NULL;
+	u32 capacitance = 0;
 
-	return __cpufreq_cooling_register(np, policy, capacitance,
-				plat_static_func);
+	if (!np) {
+		pr_err("cpu_cooling: OF node not available for cpu%d\n",
+		       policy->cpu);
+		return NULL;
+	}
+
+	if (of_find_property(np, "#cooling-cells", NULL)) {
+		of_property_read_u32(np, "dynamic-power-coefficient",
+				     &capacitance);
+
+		cdev = __cpufreq_cooling_register(np, policy, capacitance,
+						  NULL);
+		if (IS_ERR(cdev)) {
+			pr_err("cpu_cooling: cpu%d is not running as cooling device: %ld\n",
+			       policy->cpu, PTR_ERR(cdev));
+			cdev = NULL;
+		}
+	}
+
+	of_node_put(np);
+	return cdev;
 }
 EXPORT_SYMBOL(of_cpufreq_power_cooling_register);
 
diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
index d4292ebc5c8b..f09d4feb34f4 100644
--- a/include/linux/cpu_cooling.h
+++ b/include/linux/cpu_cooling.h
@@ -56,10 +56,7 @@ of_cpufreq_cooling_register(struct device_node *np,
 			    struct cpufreq_policy *policy);
 
 struct thermal_cooling_device *
-of_cpufreq_power_cooling_register(struct device_node *np,
-				  struct cpufreq_policy *policy,
-				  u32 capacitance,
-				  get_static_t plat_static_func);
+of_cpufreq_power_cooling_register(struct cpufreq_policy *policy);
 #else
 static inline struct thermal_cooling_device *
 of_cpufreq_cooling_register(struct device_node *np,
@@ -69,10 +66,7 @@ of_cpufreq_cooling_register(struct device_node *np,
 }
 
 static inline struct thermal_cooling_device *
-of_cpufreq_power_cooling_register(struct device_node *np,
-				  struct cpufreq_policy *policy,
-				  u32 capacitance,
-				  get_static_t plat_static_func)
+of_cpufreq_power_cooling_register(struct cpufreq_policy *policy)
 {
 	return NULL;
 }
@@ -105,10 +99,7 @@ of_cpufreq_cooling_register(struct device_node *np,
 }
 
 static inline struct thermal_cooling_device *
-of_cpufreq_power_cooling_register(struct device_node *np,
-				  struct cpufreq_policy *policy,
-				  u32 capacitance,
-				  get_static_t plat_static_func)
+of_cpufreq_power_cooling_register(struct cpufreq_policy *policy)
 {
 	return NULL;
 }

From ba0966da208ef0793486502a0e6b929fbd6d4223 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 5 Dec 2017 11:02:44 +0530
Subject: [PATCH 15/89] cpu_cooling: Remove unused
 cpufreq_power_cooling_register()

It isn't used by anyone, drop it.

Acked-by: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/thermal/cpu-cooling-api.txt | 24 +++---------------
 drivers/thermal/cpu_cooling.c             | 30 -----------------------
 include/linux/cpu_cooling.h               | 10 --------
 3 files changed, 3 insertions(+), 61 deletions(-)

diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
index 4f6f5e9bb4d6..ea61e8bf7e2b 100644
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ b/Documentation/thermal/cpu-cooling-api.txt
@@ -36,28 +36,14 @@ the user. The registration APIs returns the cooling device pointer.
     np: pointer to the cooling device device tree node
     clip_cpus: cpumask of cpus where the frequency constraints will happen.
 
-1.1.3 struct thermal_cooling_device *cpufreq_power_cooling_register(
-    const struct cpumask *clip_cpus, u32 capacitance,
-    get_static_t plat_static_func)
-
-Similar to cpufreq_cooling_register, this function registers a cpufreq
-cooling device.  Using this function, the cooling device will
-implement the power extensions by using a simple cpu power model.  The
-cpus must have registered their OPPs using the OPP library.
-
-The additional parameters are needed for the power model (See 2. Power
-models).  "capacitance" is the dynamic power coefficient (See 2.1
-Dynamic power).  "plat_static_func" is a function to calculate the
-static power consumed by these cpus (See 2.2 Static power).
-
-1.1.4 struct thermal_cooling_device *of_cpufreq_power_cooling_register(
+1.1.3 struct thermal_cooling_device *of_cpufreq_power_cooling_register(
 					struct cpufreq_policy *policy)
 
-Similar to cpufreq_power_cooling_register, this function register a
+Similar to cpufreq_cooling_register, this function register a
 cpufreq cooling device with power extensions using the device tree
 information supplied by the np parameter.
 
-1.1.5 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
+1.1.4 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 
     This interface function unregisters the "thermal-cpufreq-%x" cooling device.
 
@@ -74,10 +60,6 @@ cpu.  If you are using CONFIG_CPUFREQ_DT then the
 `cpufreq_frequency_table` should already be assigned to the cpu
 device.
 
-The `plat_static_func` parameter of `cpufreq_power_cooling_register()`
-is optional.  If you don't provide it, only dynamic power will be
-considered.
-
 2.1 Dynamic power
 
 The dynamic power consumption of a processor depends on many factors.
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index a31eb03c788e..10199f7e1196 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -841,36 +841,6 @@ of_cpufreq_cooling_register(struct device_node *np,
 }
 EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
 
-/**
- * cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
- * @policy:		cpufreq policy
- * @capacitance:	dynamic power coefficient for these cpus
- * @plat_static_func:	function to calculate the static power consumed by these
- *			cpus (optional)
- *
- * This interface function registers the cpufreq cooling device with
- * the name "thermal-cpufreq-%x".  This api can support multiple
- * instances of cpufreq cooling devices.  Using this function, the
- * cooling device will implement the power extensions by using a
- * simple cpu power model.  The cpus must have registered their OPPs
- * using the OPP library.
- *
- * An optional @plat_static_func may be provided to calculate the
- * static power consumed by these cpus.  If the platform's static
- * power consumption is unknown or negligible, make it NULL.
- *
- * Return: a valid struct thermal_cooling_device pointer on success,
- * on failure, it returns a corresponding ERR_PTR().
- */
-struct thermal_cooling_device *
-cpufreq_power_cooling_register(struct cpufreq_policy *policy, u32 capacitance,
-			       get_static_t plat_static_func)
-{
-	return __cpufreq_cooling_register(NULL, policy, capacitance,
-				plat_static_func);
-}
-EXPORT_SYMBOL(cpufreq_power_cooling_register);
-
 /**
  * of_cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
  * @policy: CPUFreq policy.
diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
index f09d4feb34f4..c35778960a9c 100644
--- a/include/linux/cpu_cooling.h
+++ b/include/linux/cpu_cooling.h
@@ -41,10 +41,6 @@ typedef int (*get_static_t)(cpumask_t *cpumask, int interval,
 struct thermal_cooling_device *
 cpufreq_cooling_register(struct cpufreq_policy *policy);
 
-struct thermal_cooling_device *
-cpufreq_power_cooling_register(struct cpufreq_policy *policy,
-			       u32 capacitance, get_static_t plat_static_func);
-
 /**
  * of_cpufreq_cooling_register - create cpufreq cooling device based on DT.
  * @np: a valid struct device_node to the cooling device device tree node.
@@ -84,12 +80,6 @@ cpufreq_cooling_register(struct cpufreq_policy *policy)
 {
 	return ERR_PTR(-ENOSYS);
 }
-static inline struct thermal_cooling_device *
-cpufreq_power_cooling_register(struct cpufreq_policy *policy,
-			       u32 capacitance, get_static_t plat_static_func)
-{
-	return NULL;
-}
 
 static inline struct thermal_cooling_device *
 of_cpufreq_cooling_register(struct device_node *np,

From 3ebb62ffc4e4817c3288cdf7ed67ccaf453385e3 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 5 Dec 2017 11:02:45 +0530
Subject: [PATCH 16/89] cpu_cooling: Keep only one
 of_cpufreq*cooling_register() helper

of_cpufreq_cooling_register() isn't used by anyone and so can be
removed, but then we would be left with two routines:
cpufreq_cooling_register() and of_cpufreq_power_cooling_register() that
would look odd.

Remove current implementation of of_cpufreq_cooling_register() and
rename of_cpufreq_power_cooling_register() as
of_cpufreq_cooling_register(). This simplifies lots of stuff.

Acked-by: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/thermal/cpu-cooling-api.txt | 14 ++----
 drivers/cpufreq/arm_big_little.c          |  2 +-
 drivers/cpufreq/cpufreq-dt.c              |  2 +-
 drivers/cpufreq/mediatek-cpufreq.c        |  2 +-
 drivers/cpufreq/qoriq-cpufreq.c           |  2 +-
 drivers/thermal/cpu_cooling.c             | 28 +-----------
 include/linux/cpu_cooling.h               | 55 +++++++----------------
 7 files changed, 24 insertions(+), 81 deletions(-)

diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
index ea61e8bf7e2b..7a1c89db0419 100644
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ b/Documentation/thermal/cpu-cooling-api.txt
@@ -26,24 +26,16 @@ the user. The registration APIs returns the cooling device pointer.
    clip_cpus: cpumask of cpus where the frequency constraints will happen.
 
 1.1.2 struct thermal_cooling_device *of_cpufreq_cooling_register(
-	struct device_node *np, const struct cpumask *clip_cpus)
+					struct cpufreq_policy *policy)
 
     This interface function registers the cpufreq cooling device with
     the name "thermal-cpufreq-%x" linking it with a device tree node, in
     order to bind it via the thermal DT code. This api can support multiple
     instances of cpufreq cooling devices.
 
-    np: pointer to the cooling device device tree node
-    clip_cpus: cpumask of cpus where the frequency constraints will happen.
+    policy: CPUFreq policy.
 
-1.1.3 struct thermal_cooling_device *of_cpufreq_power_cooling_register(
-					struct cpufreq_policy *policy)
-
-Similar to cpufreq_cooling_register, this function register a
-cpufreq cooling device with power extensions using the device tree
-information supplied by the np parameter.
-
-1.1.4 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
+1.1.3 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 
     This interface function unregisters the "thermal-cpufreq-%x" cooling device.
 
diff --git a/drivers/cpufreq/arm_big_little.c b/drivers/cpufreq/arm_big_little.c
index 3d5ed4ef3927..c56b57dcfda5 100644
--- a/drivers/cpufreq/arm_big_little.c
+++ b/drivers/cpufreq/arm_big_little.c
@@ -532,7 +532,7 @@ static void bL_cpufreq_ready(struct cpufreq_policy *policy)
 	if (cur_cluster >= MAX_CLUSTERS)
 		return;
 
-	cdev[cur_cluster] = of_cpufreq_power_cooling_register(policy);
+	cdev[cur_cluster] = of_cpufreq_cooling_register(policy);
 }
 
 static struct cpufreq_driver bL_cpufreq_driver = {
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 1e7bec7694ab..de3d104c25d7 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -320,7 +320,7 @@ static void cpufreq_ready(struct cpufreq_policy *policy)
 {
 	struct private_data *priv = policy->driver_data;
 
-	priv->cdev = of_cpufreq_power_cooling_register(policy);
+	priv->cdev = of_cpufreq_cooling_register(policy);
 }
 
 static struct cpufreq_driver dt_cpufreq_driver = {
diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c
index 6ff783e1b18a..f95975b76d98 100644
--- a/drivers/cpufreq/mediatek-cpufreq.c
+++ b/drivers/cpufreq/mediatek-cpufreq.c
@@ -311,7 +311,7 @@ static void mtk_cpufreq_ready(struct cpufreq_policy *policy)
 {
 	struct mtk_cpu_dvfs_info *info = policy->driver_data;
 
-	info->cdev = of_cpufreq_power_cooling_register(policy);
+	info->cdev = of_cpufreq_cooling_register(policy);
 }
 
 static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu)
diff --git a/drivers/cpufreq/qoriq-cpufreq.c b/drivers/cpufreq/qoriq-cpufreq.c
index 3a665c18e14e..0562761a3dec 100644
--- a/drivers/cpufreq/qoriq-cpufreq.c
+++ b/drivers/cpufreq/qoriq-cpufreq.c
@@ -276,7 +276,7 @@ static void qoriq_cpufreq_ready(struct cpufreq_policy *policy)
 {
 	struct cpu_data *cpud = policy->driver_data;
 
-	cpud->cdev = of_cpufreq_power_cooling_register(policy);
+	cpud->cdev = of_cpufreq_cooling_register(policy);
 }
 
 static struct cpufreq_driver qoriq_cpufreq_driver = {
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index 10199f7e1196..3371caf3095c 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -819,7 +819,6 @@ EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
 
 /**
  * of_cpufreq_cooling_register - function to create cpufreq cooling device.
- * @np: a valid struct device_node to the cooling device device tree node
  * @policy: cpufreq policy
  *
  * This interface function registers the cpufreq cooling device with the name
@@ -827,29 +826,6 @@ EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
  * cooling devices. Using this API, the cpufreq cooling device will be
  * linked to the device tree node provided.
  *
- * Return: a valid struct thermal_cooling_device pointer on success,
- * on failure, it returns a corresponding ERR_PTR().
- */
-struct thermal_cooling_device *
-of_cpufreq_cooling_register(struct device_node *np,
-			    struct cpufreq_policy *policy)
-{
-	if (!np)
-		return ERR_PTR(-EINVAL);
-
-	return __cpufreq_cooling_register(np, policy, 0, NULL);
-}
-EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
-
-/**
- * of_cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
- * @policy: CPUFreq policy.
- *
- * This interface function registers the cpufreq cooling device with
- * the name "thermal-cpufreq-%x".  This api can support multiple
- * instances of cpufreq cooling devices.  Using this API, the cpufreq
- * cooling device will be linked to the device tree node of the provided
- * policy's CPU.
  * Using this function, the cooling device will implement the power
  * extensions by using a simple cpu power model.  The cpus must have
  * registered their OPPs using the OPP library.
@@ -861,7 +837,7 @@ EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
  * and NULL on failure.
  */
 struct thermal_cooling_device *
-of_cpufreq_power_cooling_register(struct cpufreq_policy *policy)
+of_cpufreq_cooling_register(struct cpufreq_policy *policy)
 {
 	struct device_node *np = of_get_cpu_node(policy->cpu, NULL);
 	struct thermal_cooling_device *cdev = NULL;
@@ -889,7 +865,7 @@ of_cpufreq_power_cooling_register(struct cpufreq_policy *policy)
 	of_node_put(np);
 	return cdev;
 }
-EXPORT_SYMBOL(of_cpufreq_power_cooling_register);
+EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
 
 /**
  * cpufreq_cooling_unregister - function to remove cpufreq cooling device.
diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
index c35778960a9c..fd0ea8ddca93 100644
--- a/include/linux/cpu_cooling.h
+++ b/include/linux/cpu_cooling.h
@@ -41,33 +41,6 @@ typedef int (*get_static_t)(cpumask_t *cpumask, int interval,
 struct thermal_cooling_device *
 cpufreq_cooling_register(struct cpufreq_policy *policy);
 
-/**
- * of_cpufreq_cooling_register - create cpufreq cooling device based on DT.
- * @np: a valid struct device_node to the cooling device device tree node.
- * @policy: cpufreq policy.
- */
-#ifdef CONFIG_THERMAL_OF
-struct thermal_cooling_device *
-of_cpufreq_cooling_register(struct device_node *np,
-			    struct cpufreq_policy *policy);
-
-struct thermal_cooling_device *
-of_cpufreq_power_cooling_register(struct cpufreq_policy *policy);
-#else
-static inline struct thermal_cooling_device *
-of_cpufreq_cooling_register(struct device_node *np,
-			    struct cpufreq_policy *policy)
-{
-	return ERR_PTR(-ENOSYS);
-}
-
-static inline struct thermal_cooling_device *
-of_cpufreq_power_cooling_register(struct cpufreq_policy *policy)
-{
-	return NULL;
-}
-#endif
-
 /**
  * cpufreq_cooling_unregister - function to remove cpufreq cooling device.
  * @cdev: thermal cooling device pointer.
@@ -81,19 +54,6 @@ cpufreq_cooling_register(struct cpufreq_policy *policy)
 	return ERR_PTR(-ENOSYS);
 }
 
-static inline struct thermal_cooling_device *
-of_cpufreq_cooling_register(struct device_node *np,
-			    struct cpufreq_policy *policy)
-{
-	return ERR_PTR(-ENOSYS);
-}
-
-static inline struct thermal_cooling_device *
-of_cpufreq_power_cooling_register(struct cpufreq_policy *policy)
-{
-	return NULL;
-}
-
 static inline
 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 {
@@ -101,4 +61,19 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 }
 #endif	/* CONFIG_CPU_THERMAL */
 
+#if defined(CONFIG_THERMAL_OF) && defined(CONFIG_CPU_THERMAL)
+/**
+ * of_cpufreq_cooling_register - create cpufreq cooling device based on DT.
+ * @policy: cpufreq policy.
+ */
+struct thermal_cooling_device *
+of_cpufreq_cooling_register(struct cpufreq_policy *policy);
+#else
+static inline struct thermal_cooling_device *
+of_cpufreq_cooling_register(struct cpufreq_policy *policy)
+{
+	return NULL;
+}
+#endif /* defined(CONFIG_THERMAL_OF) && defined(CONFIG_CPU_THERMAL) */
+
 #endif /* __CPU_COOLING_H__ */

From 84fe2cab48590e4373978e4ef2031c977de98995 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 5 Dec 2017 11:02:46 +0530
Subject: [PATCH 17/89] cpu_cooling: Drop static-power related stuff

No one has used it for the last two and half years (since it was
introduced by commit c36cf0717631 (thermal: cpu_cooling: implement the
power cooling device API), get rid of it.

Acked-by: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/cpu_cooling.c  | 106 ++++-----------------------------
 include/linux/cpu_cooling.h    |   3 -
 include/trace/events/thermal.h |  10 ++--
 3 files changed, 16 insertions(+), 103 deletions(-)

diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index 3371caf3095c..dfd23245f778 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -88,7 +88,6 @@ struct time_in_idle {
  * @policy: cpufreq policy.
  * @node: list_head to link all cpufreq_cooling_device together.
  * @idle_time: idle time stats
- * @plat_get_static_power: callback to calculate the static power
  *
  * This structure is required for keeping information of each registered
  * cpufreq_cooling_device.
@@ -104,7 +103,6 @@ struct cpufreq_cooling_device {
 	struct cpufreq_policy *policy;
 	struct list_head node;
 	struct time_in_idle *idle_time;
-	get_static_t plat_get_static_power;
 };
 
 static DEFINE_IDA(cpufreq_ida);
@@ -318,60 +316,6 @@ static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
 	return load;
 }
 
-/**
- * get_static_power() - calculate the static power consumed by the cpus
- * @cpufreq_cdev:	struct &cpufreq_cooling_device for this cpu cdev
- * @tz:		thermal zone device in which we're operating
- * @freq:	frequency in KHz
- * @power:	pointer in which to store the calculated static power
- *
- * Calculate the static power consumed by the cpus described by
- * @cpu_actor running at frequency @freq.  This function relies on a
- * platform specific function that should have been provided when the
- * actor was registered.  If it wasn't, the static power is assumed to
- * be negligible.  The calculated static power is stored in @power.
- *
- * Return: 0 on success, -E* on failure.
- */
-static int get_static_power(struct cpufreq_cooling_device *cpufreq_cdev,
-			    struct thermal_zone_device *tz, unsigned long freq,
-			    u32 *power)
-{
-	struct dev_pm_opp *opp;
-	unsigned long voltage;
-	struct cpufreq_policy *policy = cpufreq_cdev->policy;
-	struct cpumask *cpumask = policy->related_cpus;
-	unsigned long freq_hz = freq * 1000;
-	struct device *dev;
-
-	if (!cpufreq_cdev->plat_get_static_power) {
-		*power = 0;
-		return 0;
-	}
-
-	dev = get_cpu_device(policy->cpu);
-	WARN_ON(!dev);
-
-	opp = dev_pm_opp_find_freq_exact(dev, freq_hz, true);
-	if (IS_ERR(opp)) {
-		dev_warn_ratelimited(dev, "Failed to find OPP for frequency %lu: %ld\n",
-				     freq_hz, PTR_ERR(opp));
-		return -EINVAL;
-	}
-
-	voltage = dev_pm_opp_get_voltage(opp);
-	dev_pm_opp_put(opp);
-
-	if (voltage == 0) {
-		dev_err_ratelimited(dev, "Failed to get voltage for frequency %lu\n",
-				    freq_hz);
-		return -EINVAL;
-	}
-
-	return cpufreq_cdev->plat_get_static_power(cpumask, tz->passive_delay,
-						  voltage, power);
-}
-
 /**
  * get_dynamic_power() - calculate the dynamic power
  * @cpufreq_cdev:	&cpufreq_cooling_device for this cdev
@@ -491,8 +435,8 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
 				       u32 *power)
 {
 	unsigned long freq;
-	int i = 0, cpu, ret;
-	u32 static_power, dynamic_power, total_load = 0;
+	int i = 0, cpu;
+	u32 total_load = 0;
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 	struct cpufreq_policy *policy = cpufreq_cdev->policy;
 	u32 *load_cpu = NULL;
@@ -522,22 +466,15 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
 
 	cpufreq_cdev->last_load = total_load;
 
-	dynamic_power = get_dynamic_power(cpufreq_cdev, freq);
-	ret = get_static_power(cpufreq_cdev, tz, freq, &static_power);
-	if (ret) {
-		kfree(load_cpu);
-		return ret;
-	}
+	*power = get_dynamic_power(cpufreq_cdev, freq);
 
 	if (load_cpu) {
 		trace_thermal_power_cpu_get_power(policy->related_cpus, freq,
-						  load_cpu, i, dynamic_power,
-						  static_power);
+						  load_cpu, i, *power);
 
 		kfree(load_cpu);
 	}
 
-	*power = static_power + dynamic_power;
 	return 0;
 }
 
@@ -561,8 +498,6 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
 			       unsigned long state, u32 *power)
 {
 	unsigned int freq, num_cpus;
-	u32 static_power, dynamic_power;
-	int ret;
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 
 	/* Request state should be less than max_level */
@@ -572,13 +507,9 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
 	num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus);
 
 	freq = cpufreq_cdev->freq_table[state].frequency;
-	dynamic_power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus;
-	ret = get_static_power(cpufreq_cdev, tz, freq, &static_power);
-	if (ret)
-		return ret;
+	*power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus;
 
-	*power = static_power + dynamic_power;
-	return ret;
+	return 0;
 }
 
 /**
@@ -606,21 +537,14 @@ static int cpufreq_power2state(struct thermal_cooling_device *cdev,
 			       unsigned long *state)
 {
 	unsigned int cur_freq, target_freq;
-	int ret;
-	s32 dyn_power;
-	u32 last_load, normalised_power, static_power;
+	u32 last_load, normalised_power;
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 	struct cpufreq_policy *policy = cpufreq_cdev->policy;
 
 	cur_freq = cpufreq_quick_get(policy->cpu);
-	ret = get_static_power(cpufreq_cdev, tz, cur_freq, &static_power);
-	if (ret)
-		return ret;
-
-	dyn_power = power - static_power;
-	dyn_power = dyn_power > 0 ? dyn_power : 0;
+	power = power > 0 ? power : 0;
 	last_load = cpufreq_cdev->last_load ?: 1;
-	normalised_power = (dyn_power * 100) / last_load;
+	normalised_power = (power * 100) / last_load;
 	target_freq = cpu_power_to_freq(cpufreq_cdev, normalised_power);
 
 	*state = get_level(cpufreq_cdev, target_freq);
@@ -671,8 +595,6 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table,
  * @policy: cpufreq policy
  * Normally this should be same as cpufreq policy->related_cpus.
  * @capacitance: dynamic power coefficient for these cpus
- * @plat_static_func: function to calculate the static power consumed by these
- *                    cpus (optional)
  *
  * This interface function registers the cpufreq cooling device with the name
  * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
@@ -684,8 +606,7 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table,
  */
 static struct thermal_cooling_device *
 __cpufreq_cooling_register(struct device_node *np,
-			struct cpufreq_policy *policy, u32 capacitance,
-			get_static_t plat_static_func)
+			struct cpufreq_policy *policy, u32 capacitance)
 {
 	struct thermal_cooling_device *cdev;
 	struct cpufreq_cooling_device *cpufreq_cdev;
@@ -755,8 +676,6 @@ __cpufreq_cooling_register(struct device_node *np,
 	}
 
 	if (capacitance) {
-		cpufreq_cdev->plat_get_static_power = plat_static_func;
-
 		ret = update_freq_table(cpufreq_cdev, capacitance);
 		if (ret) {
 			cdev = ERR_PTR(ret);
@@ -813,7 +732,7 @@ free_cdev:
 struct thermal_cooling_device *
 cpufreq_cooling_register(struct cpufreq_policy *policy)
 {
-	return __cpufreq_cooling_register(NULL, policy, 0, NULL);
+	return __cpufreq_cooling_register(NULL, policy, 0);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
 
@@ -853,8 +772,7 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy)
 		of_property_read_u32(np, "dynamic-power-coefficient",
 				     &capacitance);
 
-		cdev = __cpufreq_cooling_register(np, policy, capacitance,
-						  NULL);
+		cdev = __cpufreq_cooling_register(np, policy, capacitance);
 		if (IS_ERR(cdev)) {
 			pr_err("cpu_cooling: cpu%d is not running as cooling device: %ld\n",
 			       policy->cpu, PTR_ERR(cdev));
diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
index fd0ea8ddca93..de0dafb9399d 100644
--- a/include/linux/cpu_cooling.h
+++ b/include/linux/cpu_cooling.h
@@ -30,9 +30,6 @@
 
 struct cpufreq_policy;
 
-typedef int (*get_static_t)(cpumask_t *cpumask, int interval,
-			    unsigned long voltage, u32 *power);
-
 #ifdef CONFIG_CPU_THERMAL
 /**
  * cpufreq_cooling_register - function to create cpufreq cooling device.
diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h
index 78946640fe03..135e5421f003 100644
--- a/include/trace/events/thermal.h
+++ b/include/trace/events/thermal.h
@@ -94,9 +94,9 @@ TRACE_EVENT(thermal_zone_trip,
 #ifdef CONFIG_CPU_THERMAL
 TRACE_EVENT(thermal_power_cpu_get_power,
 	TP_PROTO(const struct cpumask *cpus, unsigned long freq, u32 *load,
-		size_t load_len, u32 dynamic_power, u32 static_power),
+		size_t load_len, u32 dynamic_power),
 
-	TP_ARGS(cpus, freq, load, load_len, dynamic_power, static_power),
+	TP_ARGS(cpus, freq, load, load_len, dynamic_power),
 
 	TP_STRUCT__entry(
 		__bitmask(cpumask, num_possible_cpus())
@@ -104,7 +104,6 @@ TRACE_EVENT(thermal_power_cpu_get_power,
 		__dynamic_array(u32,   load, load_len)
 		__field(size_t,        load_len      )
 		__field(u32,           dynamic_power )
-		__field(u32,           static_power  )
 	),
 
 	TP_fast_assign(
@@ -115,13 +114,12 @@ TRACE_EVENT(thermal_power_cpu_get_power,
 			load_len * sizeof(*load));
 		__entry->load_len = load_len;
 		__entry->dynamic_power = dynamic_power;
-		__entry->static_power = static_power;
 	),
 
-	TP_printk("cpus=%s freq=%lu load={%s} dynamic_power=%d static_power=%d",
+	TP_printk("cpus=%s freq=%lu load={%s} dynamic_power=%d",
 		__get_bitmask(cpumask), __entry->freq,
 		__print_array(__get_dynamic_array(load), __entry->load_len, 4),
-		__entry->dynamic_power, __entry->static_power)
+		__entry->dynamic_power)
 );
 
 TRACE_EVENT(thermal_power_cpu_limit,

From 7f6344896d3e5be3ccfef1c5c98ef24940e5f229 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Sat, 11 Nov 2017 23:17:26 +0530
Subject: [PATCH 18/89] powercap: Simplify powercap_init()

Simplify powercap_init() by reducing the number of redundant
assignments in it.

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
[ rjw: Subject+changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/powercap_sys.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c
index 5b10b50f8686..64b2b2501a79 100644
--- a/drivers/powercap/powercap_sys.c
+++ b/drivers/powercap/powercap_sys.c
@@ -673,15 +673,13 @@ EXPORT_SYMBOL_GPL(powercap_unregister_control_type);
 
 static int __init powercap_init(void)
 {
-	int result = 0;
+	int result;
 
 	result = seed_constraint_attributes();
 	if (result)
 		return result;
 
-	result = class_register(&powercap_class);
-
-	return result;
+	return class_register(&powercap_class);
 }
 
 device_initcall(powercap_init);

From 001d50c9a14f1fab94b329161a1db9235b4e60da Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 30 Nov 2017 12:54:28 +0100
Subject: [PATCH 19/89] PM / Domains: Remove obsolete "samsung,power-domain"
 check

Currently the generic PM Domain code code checks for the presence of
both (generic) "power-domains" and (Samsung Exynos legacy)
"samsung,power-domain" properties in all device tree nodes representing
devices.

There are two issues with this:
  1. This imposes a small boot-time penalty on all platforms using DT,
  2. Platform-specific checks do not really belong in core framework
     code.

Remove the platform-specific check, as the last user of
"samsung,power-domain" was removed in commit 46dcf0ff0de35da8 ("ARM:
dts: exynos: Remove exynos4415.dtsi").  All other users were converted
before in commit 0da6587041363033 ("ARM: dts: convert to generic power
domain bindings for exynos DT").

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 0c80bea05bcb..f9dcc981b6b9 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2199,20 +2199,8 @@ int genpd_dev_pm_attach(struct device *dev)
 
 	ret = of_parse_phandle_with_args(dev->of_node, "power-domains",
 					"#power-domain-cells", 0, &pd_args);
-	if (ret < 0) {
-		if (ret != -ENOENT)
-			return ret;
-
-		/*
-		 * Try legacy Samsung-specific bindings
-		 * (for backwards compatibility of DT ABI)
-		 */
-		pd_args.args_count = 0;
-		pd_args.np = of_parse_phandle(dev->of_node,
-						"samsung,power-domain", 0);
-		if (!pd_args.np)
-			return -ENOENT;
-	}
+	if (ret < 0)
+		return ret;
 
 	mutex_lock(&gpd_list_lock);
 	pd = genpd_get_from_provider(&pd_args);

From 1d0d064307cbfd8546841f6e9d94d02c55e45e1e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 7 Dec 2017 15:15:19 +0530
Subject: [PATCH 20/89] cpufreq: longhaul: Revert transition_delay_us to 200 ms

The commit e948bc8fbee0 ("cpufreq: Cap the default transition delay
value to 10 ms") caused a regression on EPIA-M min-ITX computer where
shutdown or reboot hangs occasionally with a print message like:

longhaul: Warning: Timeout while waiting for idle PCI bus
cpufreq: __target_index: Failed to change cpu frequency: -16

This probably happens because the cpufreq governor tries to change the
frequency of the CPU faster than allowed by the hardware.

Before the above commit, the default transition delay was set to 200 ms
for a transition_latency of 200000 ns. Lets revert back to that
transition delay value to fix it. Note that several other transition
delay values were tested like 20 ms and 30 ms and none of them have
resolved system hang issue completely.

Fixes: e948bc8fbee0 (cpufreq: Cap the default transition delay value to 10 ms)
Reported-by: Meelis Roos <mroos@linux.ee>
Suggested-by: Rafael J. Wysocki <rjw@rjwysocki.net>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/longhaul.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c
index c46a12df40dd..5faa37c5b091 100644
--- a/drivers/cpufreq/longhaul.c
+++ b/drivers/cpufreq/longhaul.c
@@ -894,7 +894,7 @@ static int longhaul_cpu_init(struct cpufreq_policy *policy)
 	if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
 		longhaul_setup_voltagescaling();
 
-	policy->cpuinfo.transition_latency = 200000;	/* nsec */
+	policy->transition_delay_us = 200000;	/* usec */
 
 	return cpufreq_table_validate_and_show(policy, longhaul_table);
 }

From 7e6a70a57800014743ecfae7023c379388eff121 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Date: Fri, 8 Dec 2017 11:56:10 +0900
Subject: [PATCH 21/89] PM / core: remove unneeded kallsyms include

The file was converted from print_fn_descriptor_symbol()
to %pF some time ago (c80cfb0406c01bb "vsprintf: use new
vsprintf symbolic function pointer format"). kallsyms does
not seem to be needed anymore.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index c0d5f4a3611d..d8aa88baf9c1 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -18,7 +18,6 @@
  */
 
 #include <linux/device.h>
-#include <linux/kallsyms.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/pm.h>

From a9596dbc3582c19da0958df25b3852696a0f5ae5 Mon Sep 17 00:00:00 2001
From: Andrew-sh Cheng <andrew-sh.cheng@mediatek.com>
Date: Fri, 8 Dec 2017 14:07:55 +0800
Subject: [PATCH 22/89] cpufreq: mediatek: add mt2712 into compatible list

Support mt2712 in mediatek-cpufreq.c

Signed-off-by: Andrew-sh Cheng <andrew-sh.cheng@mediatek.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mediatek-cpufreq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c
index e0d5090b303d..b783919f063d 100644
--- a/drivers/cpufreq/mediatek-cpufreq.c
+++ b/drivers/cpufreq/mediatek-cpufreq.c
@@ -574,6 +574,7 @@ static struct platform_driver mtk_cpufreq_platdrv = {
 /* List of machines supported by this driver */
 static const struct of_device_id mtk_cpufreq_machines[] __initconst = {
 	{ .compatible = "mediatek,mt2701", },
+	{ .compatible = "mediatek,mt2712", },
 	{ .compatible = "mediatek,mt7622", },
 	{ .compatible = "mediatek,mt7623", },
 	{ .compatible = "mediatek,mt817x", },

From 6066998cbd2b1012a8d5bc9a2957cfd0ad53150e Mon Sep 17 00:00:00 2001
From: Andrew-sh Cheng <andrew-sh.cheng@mediatek.com>
Date: Fri, 8 Dec 2017 14:07:56 +0800
Subject: [PATCH 23/89] cpufreq: mediatek: add mediatek related projects into
 blacklist

mediatek projects will use mediate-cpufreq.c as cpufreq driver,
instead of using cpufreq_dt.c
Add mediatek related projects into cpufreq-dt blacklist

Signed-off-by: Andrew-sh Cheng <andrew-sh.cheng@mediatek.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index ecc56e26f8f6..3b585e4bfac5 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -108,6 +108,14 @@ static const struct of_device_id blacklist[] __initconst = {
 
 	{ .compatible = "marvell,armadaxp", },
 
+	{ .compatible = "mediatek,mt2701", },
+	{ .compatible = "mediatek,mt2712", },
+	{ .compatible = "mediatek,mt7622", },
+	{ .compatible = "mediatek,mt7623", },
+	{ .compatible = "mediatek,mt817x", },
+	{ .compatible = "mediatek,mt8173", },
+	{ .compatible = "mediatek,mt8176", },
+
 	{ .compatible = "nvidia,tegra124", },
 
 	{ .compatible = "st,stih407", },

From 34fb8f0ba9ceea88e116688f9f53e3802c38aafb Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 10 Dec 2017 00:56:50 +0100
Subject: [PATCH 24/89] PM / core: Use dev_pm_skip_next_resume_phases()
 internally

Make the PM core call dev_pm_skip_next_resume_phases() to skip the
"early resume" and "resume" phases of system-wide transitions to the
working state for a given device instead of clearing the relevant
status bits for it directly.

No intentional changes in functionality.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/base/power/main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index d8aa88baf9c1..cd48b1c69167 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -593,8 +593,7 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
 		 * device again.
 		 */
 		pm_runtime_set_suspended(dev);
-		dev->power.is_late_suspended = false;
-		dev->power.is_suspended = false;
+		dev_pm_skip_next_resume_phases(dev);
 	}
 
  Out:

From a248efb3d634854dfcf43a165003597542e90d11 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 30 Nov 2017 12:57:00 +0100
Subject: [PATCH 25/89] bus: simple-pm-bus: convert bool SIMPLE_PM_BUS to
 tristate

The Kconfig currently controlling compilation of this code is:

config SIMPLE_PM_BUS
        bool "Simple Power-Managed Bus Driver"

...meaning that it currently is not being built as a module by anyone.

In removing the orphaned modular support in a previous patch set,
Geert indicated he'd rather see this code converted to tristate.

I normally don't do that because it extends functionality that I
can't easily run time test or even know if the use case makes sense,
but since in this case the author has nominated it as such, we do
the conversion here.

Note that doesn't change the lack of run time testing ; this change
is only tested for sucessful compile and modpost.

[geert: Ethernet is probed successfully on sh73a0/kzm9g after
        insmodding simple-pm-bus.ko]

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/bus/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig
index dc7b3c7b7d42..57e011d36a79 100644
--- a/drivers/bus/Kconfig
+++ b/drivers/bus/Kconfig
@@ -120,7 +120,7 @@ config QCOM_EBI2
 	  SRAM, ethernet adapters, FPGAs and LCD displays.
 
 config SIMPLE_PM_BUS
-	bool "Simple Power-Managed Bus Driver"
+	tristate "Simple Power-Managed Bus Driver"
 	depends on OF && PM
 	help
 	  Driver for transparent busses that don't need a real driver, but

From 0e96a0c83f0842d5cfd83b0a896bc82ff61f9849 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Thu, 14 Dec 2017 17:16:29 -0800
Subject: [PATCH 26/89] cpupower: Remove FSF address

Checkpatch in the kernel now complains about having the FSF address
in comments. Other tools such as rpmlint are now starting to do the
same thing. Remove the FSF address to reduce warnings on multiple tools.

Signed-off-by: Laura Abbott <labbott@redhat.com>
Acked-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
---
 tools/power/cpupower/lib/cpufreq.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tools/power/cpupower/lib/cpufreq.h b/tools/power/cpupower/lib/cpufreq.h
index 3b005c39f068..60beaf5ed2ea 100644
--- a/tools/power/cpupower/lib/cpufreq.h
+++ b/tools/power/cpupower/lib/cpufreq.h
@@ -11,10 +11,6 @@
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #ifndef __CPUPOWER_CPUFREQ_H__

From b17d2f8d373ef648a05ad7b894722f154d6660f4 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Wed, 13 Dec 2017 18:05:35 +0100
Subject: [PATCH 27/89] cpufreq: ARM: sort the Kconfig menu

Group all the related big LITTLE configuration together and sort the
other entries in alphabetic order.

Also fixing tab vs space issue while mofifying these entries.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig.arm | 81 ++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 41 deletions(-)

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index bdce4488ded1..beb8826afbb1 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -2,6 +2,22 @@
 # ARM CPU Frequency scaling drivers
 #
 
+config ACPI_CPPC_CPUFREQ
+	tristate "CPUFreq driver based on the ACPI CPPC spec"
+	depends on ACPI_PROCESSOR
+	select ACPI_CPPC_LIB
+	help
+	  This adds a CPUFreq driver which uses CPPC methods
+	  as described in the ACPIv5.1 spec. CPPC stands for
+	  Collaborative Processor Performance Controls. It
+	  is based on an abstract continuous scale of CPU
+	  performance values which allows the remote power
+	  processor to flexibly optimize for power and
+	  performance. CPPC relies on power management firmware
+	  support for its operation.
+
+	  If in doubt, say N.
+
 # big LITTLE core layer and glue drivers
 config ARM_BIG_LITTLE_CPUFREQ
 	tristate "Generic ARM big LITTLE CPUfreq driver"
@@ -12,6 +28,30 @@ config ARM_BIG_LITTLE_CPUFREQ
 	help
 	  This enables the Generic CPUfreq driver for ARM big.LITTLE platforms.
 
+config ARM_DT_BL_CPUFREQ
+	tristate "Generic probing via DT for ARM big LITTLE CPUfreq driver"
+	depends on ARM_BIG_LITTLE_CPUFREQ && OF
+	help
+	  This enables probing via DT for Generic CPUfreq driver for ARM
+	  big.LITTLE platform. This gets frequency tables from DT.
+
+config ARM_SCPI_CPUFREQ
+	tristate "SCPI based CPUfreq driver"
+	depends on ARM_BIG_LITTLE_CPUFREQ && ARM_SCPI_PROTOCOL && COMMON_CLK_SCPI
+	help
+	  This adds the CPUfreq driver support for ARM big.LITTLE platforms
+	  using SCPI protocol for CPU power management.
+
+	  This driver uses SCPI Message Protocol driver to interact with the
+	  firmware providing the CPU DVFS functionality.
+
+config ARM_VEXPRESS_SPC_CPUFREQ
+	tristate "Versatile Express SPC based CPUfreq driver"
+	depends on ARM_BIG_LITTLE_CPUFREQ && ARCH_VEXPRESS_SPC
+	help
+	  This add the CPUfreq driver support for Versatile Express
+	  big.LITTLE platforms using SPC for power management.
+
 config ARM_BRCMSTB_AVS_CPUFREQ
 	tristate "Broadcom STB AVS CPUfreq driver"
 	depends on ARCH_BRCMSTB || COMPILE_TEST
@@ -33,20 +73,6 @@ config ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
 
 	  If in doubt, say N.
 
-config ARM_DT_BL_CPUFREQ
-	tristate "Generic probing via DT for ARM big LITTLE CPUfreq driver"
-	depends on ARM_BIG_LITTLE_CPUFREQ && OF
-	help
-	  This enables probing via DT for Generic CPUfreq driver for ARM
-	  big.LITTLE platform. This gets frequency tables from DT.
-
-config ARM_VEXPRESS_SPC_CPUFREQ
-        tristate "Versatile Express SPC based CPUfreq driver"
-	depends on ARM_BIG_LITTLE_CPUFREQ && ARCH_VEXPRESS_SPC
-        help
-          This add the CPUfreq driver support for Versatile Express
-	  big.LITTLE platforms using SPC for power management.
-
 config ARM_EXYNOS5440_CPUFREQ
 	tristate "SAMSUNG EXYNOS5440"
 	depends on SOC_EXYNOS5440
@@ -205,16 +231,6 @@ config ARM_SA1100_CPUFREQ
 config ARM_SA1110_CPUFREQ
 	bool
 
-config ARM_SCPI_CPUFREQ
-        tristate "SCPI based CPUfreq driver"
-	depends on ARM_BIG_LITTLE_CPUFREQ && ARM_SCPI_PROTOCOL && COMMON_CLK_SCPI
-        help
-	  This adds the CPUfreq driver support for ARM big.LITTLE platforms
-	  using SCPI protocol for CPU power management.
-
-	  This driver uses SCPI Message Protocol driver to interact with the
-	  firmware providing the CPU DVFS functionality.
-
 config ARM_SPEAR_CPUFREQ
 	bool "SPEAr CPUFreq support"
 	depends on PLAT_SPEAR
@@ -275,20 +291,3 @@ config ARM_PXA2xx_CPUFREQ
 	  This add the CPUFreq driver support for Intel PXA2xx SOCs.
 
 	  If in doubt, say N.
-
-config ACPI_CPPC_CPUFREQ
-	tristate "CPUFreq driver based on the ACPI CPPC spec"
-	depends on ACPI_PROCESSOR
-	select ACPI_CPPC_LIB
-	default n
-	help
-	  This adds a CPUFreq driver which uses CPPC methods
-	  as described in the ACPIv5.1 spec. CPPC stands for
-	  Collaborative Processor Performance Controls. It
-	  is based on an abstract continuous scale of CPU
-	  performance values which allows the remote power
-	  processor to flexibly optimize for power and
-	  performance. CPPC relies on power management firmware
-	  support for its operation.
-
-	  If in doubt, say N.

From 16630642f174cfc9b169c39cc74ec8e1360a9c55 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Wed, 13 Dec 2017 18:05:36 +0100
Subject: [PATCH 28/89] cpufreq: sort the drivers in ARM part

Keep the driver files alphabetically sorted.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 812f9e0d01a3..d762e76887e7 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -53,22 +53,24 @@ obj-$(CONFIG_ARM_BIG_LITTLE_CPUFREQ)	+= arm_big_little.o
 obj-$(CONFIG_ARM_DT_BL_CPUFREQ)		+= arm_big_little_dt.o
 
 obj-$(CONFIG_ARM_BRCMSTB_AVS_CPUFREQ)	+= brcmstb-avs-cpufreq.o
+obj-$(CONFIG_ACPI_CPPC_CPUFREQ)		+= cppc_cpufreq.o
 obj-$(CONFIG_ARCH_DAVINCI)		+= davinci-cpufreq.o
 obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ)	+= exynos5440-cpufreq.o
 obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ)	+= highbank-cpufreq.o
 obj-$(CONFIG_ARM_IMX6Q_CPUFREQ)		+= imx6q-cpufreq.o
 obj-$(CONFIG_ARM_KIRKWOOD_CPUFREQ)	+= kirkwood-cpufreq.o
 obj-$(CONFIG_ARM_MEDIATEK_CPUFREQ)	+= mediatek-cpufreq.o
+obj-$(CONFIG_MACH_MVEBU_V7)		+= mvebu-cpufreq.o
 obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ)	+= omap-cpufreq.o
 obj-$(CONFIG_ARM_PXA2xx_CPUFREQ)	+= pxa2xx-cpufreq.o
 obj-$(CONFIG_PXA3xx)			+= pxa3xx-cpufreq.o
-obj-$(CONFIG_ARM_S3C24XX_CPUFREQ)	+= s3c24xx-cpufreq.o
-obj-$(CONFIG_ARM_S3C24XX_CPUFREQ_DEBUGFS) += s3c24xx-cpufreq-debugfs.o
 obj-$(CONFIG_ARM_S3C2410_CPUFREQ)	+= s3c2410-cpufreq.o
 obj-$(CONFIG_ARM_S3C2412_CPUFREQ)	+= s3c2412-cpufreq.o
 obj-$(CONFIG_ARM_S3C2416_CPUFREQ)	+= s3c2416-cpufreq.o
 obj-$(CONFIG_ARM_S3C2440_CPUFREQ)	+= s3c2440-cpufreq.o
 obj-$(CONFIG_ARM_S3C64XX_CPUFREQ)	+= s3c64xx-cpufreq.o
+obj-$(CONFIG_ARM_S3C24XX_CPUFREQ)	+= s3c24xx-cpufreq.o
+obj-$(CONFIG_ARM_S3C24XX_CPUFREQ_DEBUGFS) += s3c24xx-cpufreq-debugfs.o
 obj-$(CONFIG_ARM_S5PV210_CPUFREQ)	+= s5pv210-cpufreq.o
 obj-$(CONFIG_ARM_SA1100_CPUFREQ)	+= sa1100-cpufreq.o
 obj-$(CONFIG_ARM_SA1110_CPUFREQ)	+= sa1110-cpufreq.o
@@ -81,8 +83,6 @@ obj-$(CONFIG_ARM_TEGRA124_CPUFREQ)	+= tegra124-cpufreq.o
 obj-$(CONFIG_ARM_TEGRA186_CPUFREQ)	+= tegra186-cpufreq.o
 obj-$(CONFIG_ARM_TI_CPUFREQ)		+= ti-cpufreq.o
 obj-$(CONFIG_ARM_VEXPRESS_SPC_CPUFREQ)	+= vexpress-spc-cpufreq.o
-obj-$(CONFIG_ACPI_CPPC_CPUFREQ) += cppc_cpufreq.o
-obj-$(CONFIG_MACH_MVEBU_V7)		+= mvebu-cpufreq.o
 
 
 ##################################################################################

From b3371600926638842c460d652599c4dddef72da6 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Wed, 13 Dec 2017 18:29:13 +0100
Subject: [PATCH 29/89] cpufreq: mvebu: Free the clock reference in the normal
 path

In case of error the clock reference was freed but not in normal path
once it was nor more used. This patch fixes it.

Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mvebu-cpufreq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/mvebu-cpufreq.c b/drivers/cpufreq/mvebu-cpufreq.c
index ed915ee85dd9..c043aad8e3a0 100644
--- a/drivers/cpufreq/mvebu-cpufreq.c
+++ b/drivers/cpufreq/mvebu-cpufreq.c
@@ -99,6 +99,7 @@ static int __init armada_xp_pmsu_cpufreq_init(void)
 		if (ret)
 			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
 				__func__, ret);
+		clk_put(clk);
 	}
 
 	platform_device_register_simple("cpufreq-dt", -1, NULL, 0);

From 3f4590a4a371006f66385eb5b739d5d01249ee3f Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Wed, 13 Dec 2017 18:29:14 +0100
Subject: [PATCH 30/89] cpufreq: mvebu: Free opp if registering failed

Since the introduction of this driver, the functions to remove the opp
were added. So stop claiming we can't remove opp and use one of them in
case of failure.

Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mvebu-cpufreq.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/cpufreq/mvebu-cpufreq.c b/drivers/cpufreq/mvebu-cpufreq.c
index c043aad8e3a0..31513bd42705 100644
--- a/drivers/cpufreq/mvebu-cpufreq.c
+++ b/drivers/cpufreq/mvebu-cpufreq.c
@@ -76,12 +76,6 @@ static int __init armada_xp_pmsu_cpufreq_init(void)
 			return PTR_ERR(clk);
 		}
 
-		/*
-		 * In case of a failure of dev_pm_opp_add(), we don't
-		 * bother with cleaning up the registered OPP (there's
-		 * no function to do so), and simply cancel the
-		 * registration of the cpufreq device.
-		 */
 		ret = dev_pm_opp_add(cpu_dev, clk_get_rate(clk), 0);
 		if (ret) {
 			clk_put(clk);
@@ -91,7 +85,8 @@ static int __init armada_xp_pmsu_cpufreq_init(void)
 		ret = dev_pm_opp_add(cpu_dev, clk_get_rate(clk) / 2, 0);
 		if (ret) {
 			clk_put(clk);
-			return ret;
+			dev_err(cpu_dev, "Failed to register OPPs\n");
+			goto opp_register_failed;
 		}
 
 		ret = dev_pm_opp_set_sharing_cpus(cpu_dev,
@@ -104,5 +99,11 @@ static int __init armada_xp_pmsu_cpufreq_init(void)
 
 	platform_device_register_simple("cpufreq-dt", -1, NULL, 0);
 	return 0;
+
+opp_register_failed:
+	/* As registering has failed remove all the opp for all cpus */
+	dev_pm_opp_cpumask_remove_table(cpu_possible_mask);
+
+	return ret;
 }
 device_initcall(armada_xp_pmsu_cpufreq_init);

From 7a62f48b271919b78f23c216ede6ac5d2b8b0368 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Thu, 14 Dec 2017 16:00:03 +0100
Subject: [PATCH 31/89] dt-bindings: marvell: Add documentation for the North
 Bridge PM on Armada 37xx

Extend the documentation of the Armada 37xx SoC with the the North
Bridge Power Management component.

Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../bindings/arm/marvell/armada-37xx.txt      | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt
index 51336e5fc761..35c3c3460d17 100644
--- a/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt
+++ b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt
@@ -14,3 +14,22 @@ following property before the previous one:
 Example:
 
 compatible = "marvell,armada-3720-db", "marvell,armada3720", "marvell,armada3710";
+
+
+Power management
+----------------
+
+For power management (particularly DVFS and AVS), the North Bridge
+Power Management component is needed:
+
+Required properties:
+- compatible     : should contain "marvell,armada-3700-nb-pm", "syscon";
+- reg            : the register start and length for the North Bridge
+		    Power Management
+
+Example:
+
+nb_pm: syscon@14000 {
+	compatible = "marvell,armada-3700-nb-pm", "syscon";
+	reg = <0x14000 0x60>;
+}

From d71f617af7849e65884e2a5bd590807ca6b905b8 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Thu, 14 Dec 2017 16:00:04 +0100
Subject: [PATCH 32/89] MAINTAINERS: add new entries for Armada 37xx cpufreq
 driver

This new driver belongs to the mvebu family, update the MAINTAINER file
to document it.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index d4fdcb12616c..8b275dd1105a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1582,6 +1582,7 @@ F:	arch/arm/boot/dts/kirkwood*
 F:	arch/arm/configs/mvebu_*_defconfig
 F:	arch/arm/mach-mvebu/
 F:	arch/arm64/boot/dts/marvell/armada*
+F:	drivers/cpufreq/armada-37xx-cpufreq.c
 F:	drivers/cpufreq/mvebu-cpufreq.c
 F:	drivers/irqchip/irq-armada-370-xp.c
 F:	drivers/irqchip/irq-mvebu-*

From 92ce45fb875d7c3e021cc454482fe0687ff54f29 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Thu, 14 Dec 2017 16:00:05 +0100
Subject: [PATCH 33/89] cpufreq: Add DVFS support for Armada 37xx

This patch adds DVFS support for the Armada 37xx SoCs

There are up to four CPU frequency loads for Armada 37xx controlled by
the hardware.

This driver associates the CPU load level to a frequency, then the
hardware will switch while selecting a load level.

The hardware also can associate a voltage for each level (AVS support)
but it is not yet supported

Tested-by: Andre Heider <a.heider@gmail.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig.arm           |   7 +
 drivers/cpufreq/Makefile              |   1 +
 drivers/cpufreq/armada-37xx-cpufreq.c | 241 ++++++++++++++++++++++++++
 3 files changed, 249 insertions(+)
 create mode 100644 drivers/cpufreq/armada-37xx-cpufreq.c

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index beb8826afbb1..3a88e33b0cfe 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -18,6 +18,13 @@ config ACPI_CPPC_CPUFREQ
 
 	  If in doubt, say N.
 
+config ARM_ARMADA_37XX_CPUFREQ
+	tristate "Armada 37xx CPUFreq support"
+	depends on ARCH_MVEBU
+	help
+	  This adds the CPUFreq driver support for Marvell Armada 37xx SoCs.
+	  The Armada 37xx PMU supports 4 frequency and VDD levels.
+
 # big LITTLE core layer and glue drivers
 config ARM_BIG_LITTLE_CPUFREQ
 	tristate "Generic ARM big LITTLE CPUfreq driver"
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index d762e76887e7..e07715ce8844 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_ARM_BIG_LITTLE_CPUFREQ)	+= arm_big_little.o
 # LITTLE drivers, so that it is probed last.
 obj-$(CONFIG_ARM_DT_BL_CPUFREQ)		+= arm_big_little_dt.o
 
+obj-$(CONFIG_ARM_ARMADA_37XX_CPUFREQ)	+= armada-37xx-cpufreq.o
 obj-$(CONFIG_ARM_BRCMSTB_AVS_CPUFREQ)	+= brcmstb-avs-cpufreq.o
 obj-$(CONFIG_ACPI_CPPC_CPUFREQ)		+= cppc_cpufreq.o
 obj-$(CONFIG_ARCH_DAVINCI)		+= davinci-cpufreq.o
diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c
new file mode 100644
index 000000000000..c6ebc88a7d8d
--- /dev/null
+++ b/drivers/cpufreq/armada-37xx-cpufreq.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * CPU frequency scaling support for Armada 37xx platform.
+ *
+ * Copyright (C) 2017 Marvell
+ *
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ */
+
+#include <linux/clk.h>
+#include <linux/cpu.h>
+#include <linux/cpufreq.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/pm_opp.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+/* Power management in North Bridge register set */
+#define ARMADA_37XX_NB_L0L1	0x18
+#define ARMADA_37XX_NB_L2L3	0x1C
+#define  ARMADA_37XX_NB_TBG_DIV_OFF	13
+#define  ARMADA_37XX_NB_TBG_DIV_MASK	0x7
+#define  ARMADA_37XX_NB_CLK_SEL_OFF	11
+#define  ARMADA_37XX_NB_CLK_SEL_MASK	0x1
+#define  ARMADA_37XX_NB_CLK_SEL_TBG	0x1
+#define  ARMADA_37XX_NB_TBG_SEL_OFF	9
+#define  ARMADA_37XX_NB_TBG_SEL_MASK	0x3
+#define  ARMADA_37XX_NB_VDD_SEL_OFF	6
+#define  ARMADA_37XX_NB_VDD_SEL_MASK	0x3
+#define  ARMADA_37XX_NB_CONFIG_SHIFT	16
+#define ARMADA_37XX_NB_DYN_MOD	0x24
+#define  ARMADA_37XX_NB_CLK_SEL_EN	BIT(26)
+#define  ARMADA_37XX_NB_TBG_EN		BIT(28)
+#define  ARMADA_37XX_NB_DIV_EN		BIT(29)
+#define  ARMADA_37XX_NB_VDD_EN		BIT(30)
+#define  ARMADA_37XX_NB_DFS_EN		BIT(31)
+#define ARMADA_37XX_NB_CPU_LOAD 0x30
+#define  ARMADA_37XX_NB_CPU_LOAD_MASK	0x3
+#define  ARMADA_37XX_DVFS_LOAD_0	0
+#define  ARMADA_37XX_DVFS_LOAD_1	1
+#define  ARMADA_37XX_DVFS_LOAD_2	2
+#define  ARMADA_37XX_DVFS_LOAD_3	3
+
+/*
+ * On Armada 37xx the Power management manages 4 level of CPU load,
+ * each level can be associated with a CPU clock source, a CPU
+ * divider, a VDD level, etc...
+ */
+#define LOAD_LEVEL_NR	4
+
+struct armada_37xx_dvfs {
+	u32 cpu_freq_max;
+	u8 divider[LOAD_LEVEL_NR];
+};
+
+static struct armada_37xx_dvfs armada_37xx_dvfs[] = {
+	{.cpu_freq_max = 1200*1000*1000, .divider = {1, 2, 4, 6} },
+	{.cpu_freq_max = 1000*1000*1000, .divider = {1, 2, 4, 5} },
+	{.cpu_freq_max = 800*1000*1000,  .divider = {1, 2, 3, 4} },
+	{.cpu_freq_max = 600*1000*1000,  .divider = {2, 4, 5, 6} },
+};
+
+static struct armada_37xx_dvfs *armada_37xx_cpu_freq_info_get(u32 freq)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(armada_37xx_dvfs); i++) {
+		if (freq == armada_37xx_dvfs[i].cpu_freq_max)
+			return &armada_37xx_dvfs[i];
+	}
+
+	pr_err("Unsupported CPU frequency %d MHz\n", freq/1000000);
+	return NULL;
+}
+
+/*
+ * Setup the four level managed by the hardware. Once the four level
+ * will be configured then the DVFS will be enabled.
+ */
+static void __init armada37xx_cpufreq_dvfs_setup(struct regmap *base,
+						 struct clk *clk, u8 *divider)
+{
+	int load_lvl;
+	struct clk *parent;
+
+	for (load_lvl = 0; load_lvl < LOAD_LEVEL_NR; load_lvl++) {
+		unsigned int reg, mask, val, offset = 0;
+
+		if (load_lvl <= ARMADA_37XX_DVFS_LOAD_1)
+			reg = ARMADA_37XX_NB_L0L1;
+		else
+			reg = ARMADA_37XX_NB_L2L3;
+
+		if (load_lvl == ARMADA_37XX_DVFS_LOAD_0 ||
+		    load_lvl == ARMADA_37XX_DVFS_LOAD_2)
+			offset += ARMADA_37XX_NB_CONFIG_SHIFT;
+
+		/* Set cpu clock source, for all the level we use TBG */
+		val = ARMADA_37XX_NB_CLK_SEL_TBG << ARMADA_37XX_NB_CLK_SEL_OFF;
+		mask = (ARMADA_37XX_NB_CLK_SEL_MASK
+			<< ARMADA_37XX_NB_CLK_SEL_OFF);
+
+		/*
+		 * Set cpu divider based on the pre-computed array in
+		 * order to have balanced step.
+		 */
+		val |= divider[load_lvl] << ARMADA_37XX_NB_TBG_DIV_OFF;
+		mask |= (ARMADA_37XX_NB_TBG_DIV_MASK
+			<< ARMADA_37XX_NB_TBG_DIV_OFF);
+
+		/* Set VDD divider which is actually the load level. */
+		val |= load_lvl << ARMADA_37XX_NB_VDD_SEL_OFF;
+		mask |= (ARMADA_37XX_NB_VDD_SEL_MASK
+			<< ARMADA_37XX_NB_VDD_SEL_OFF);
+
+		val <<= offset;
+		mask <<= offset;
+
+		regmap_update_bits(base, reg, mask, val);
+	}
+
+	/*
+	 * Set cpu clock source, for all the level we keep the same
+	 * clock source that the one already configured. For this one
+	 * we need to use the clock framework
+	 */
+	parent = clk_get_parent(clk);
+	clk_set_parent(clk, parent);
+}
+
+static void __init armada37xx_cpufreq_disable_dvfs(struct regmap *base)
+{
+	unsigned int reg = ARMADA_37XX_NB_DYN_MOD,
+		mask = ARMADA_37XX_NB_DFS_EN;
+
+	regmap_update_bits(base, reg, mask, 0);
+}
+
+static void __init armada37xx_cpufreq_enable_dvfs(struct regmap *base)
+{
+	unsigned int val, reg = ARMADA_37XX_NB_CPU_LOAD,
+		mask = ARMADA_37XX_NB_CPU_LOAD_MASK;
+
+	/* Start with the highest load (0) */
+	val = ARMADA_37XX_DVFS_LOAD_0;
+	regmap_update_bits(base, reg, mask, val);
+
+	/* Now enable DVFS for the CPUs */
+	reg = ARMADA_37XX_NB_DYN_MOD;
+	mask =	ARMADA_37XX_NB_CLK_SEL_EN | ARMADA_37XX_NB_TBG_EN |
+		ARMADA_37XX_NB_DIV_EN | ARMADA_37XX_NB_VDD_EN |
+		ARMADA_37XX_NB_DFS_EN;
+
+	regmap_update_bits(base, reg, mask, mask);
+}
+
+static int __init armada37xx_cpufreq_driver_init(void)
+{
+	struct armada_37xx_dvfs *dvfs;
+	struct platform_device *pdev;
+	unsigned int cur_frequency;
+	struct regmap *nb_pm_base;
+	struct device *cpu_dev;
+	int load_lvl, ret;
+	struct clk *clk;
+
+	nb_pm_base =
+		syscon_regmap_lookup_by_compatible("marvell,armada-3700-nb-pm");
+
+	if (IS_ERR(nb_pm_base))
+		return -ENODEV;
+
+	/* Before doing any configuration on the DVFS first, disable it */
+	armada37xx_cpufreq_disable_dvfs(nb_pm_base);
+
+	/*
+	 * On CPU 0 register the operating points supported (which are
+	 * the nominal CPU frequency and full integer divisions of
+	 * it).
+	 */
+	cpu_dev = get_cpu_device(0);
+	if (!cpu_dev) {
+		dev_err(cpu_dev, "Cannot get CPU\n");
+		return -ENODEV;
+	}
+
+	clk = clk_get(cpu_dev, 0);
+	if (IS_ERR(clk)) {
+		dev_err(cpu_dev, "Cannot get clock for CPU0\n");
+		return PTR_ERR(clk);
+	}
+
+	/* Get nominal (current) CPU frequency */
+	cur_frequency = clk_get_rate(clk);
+	if (!cur_frequency) {
+		dev_err(cpu_dev, "Failed to get clock rate for CPU\n");
+		return -EINVAL;
+	}
+
+	dvfs = armada_37xx_cpu_freq_info_get(cur_frequency);
+	if (!dvfs)
+		return -EINVAL;
+
+	armada37xx_cpufreq_dvfs_setup(nb_pm_base, clk, dvfs->divider);
+
+	for (load_lvl = ARMADA_37XX_DVFS_LOAD_0; load_lvl < LOAD_LEVEL_NR;
+	     load_lvl++) {
+		unsigned long freq = cur_frequency / dvfs->divider[load_lvl];
+
+		ret = dev_pm_opp_add(cpu_dev, freq, 0);
+		if (ret) {
+			/* clean-up the already added opp before leaving */
+			while (load_lvl-- > ARMADA_37XX_DVFS_LOAD_0) {
+				freq = cur_frequency / dvfs->divider[load_lvl];
+				dev_pm_opp_remove(cpu_dev, freq);
+			}
+			return ret;
+		}
+	}
+
+	/* Now that everything is setup, enable the DVFS at hardware level */
+	armada37xx_cpufreq_enable_dvfs(nb_pm_base);
+
+	pdev = platform_device_register_simple("cpufreq-dt", -1, NULL, 0);
+
+	return PTR_ERR_OR_ZERO(pdev);
+}
+/* late_initcall, to guarantee the driver is loaded after A37xx clock driver */
+late_initcall(armada37xx_cpufreq_driver_init);
+
+MODULE_AUTHOR("Gregory CLEMENT <gregory.clement@free-electrons.com>");
+MODULE_DESCRIPTION("Armada 37xx cpufreq driver");
+MODULE_LICENSE("GPL");

From db410b2b3839e962f9df4bc87b4fea9a2996047c Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Thu, 14 Dec 2017 22:25:25 -0600
Subject: [PATCH 34/89] cpufreq: ti-cpufreq: Convert to module_platform_driver

ti-cpufreq will be responsible for calling dev_pm_opp_set_regulators on
platforms that require AVS and ABB regulator support so we must be
able to defer probe if regulators are not yet available, so change
ti-cpufreq to be a module_platform_driver to allow for probe defer.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/ti-cpufreq.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index 923317f03b4b..b1c230a1e2aa 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -17,6 +17,7 @@
 #include <linux/cpu.h>
 #include <linux/io.h>
 #include <linux/mfd/syscon.h>
+#include <linux/module.h>
 #include <linux/init.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
@@ -195,7 +196,7 @@ static const struct of_device_id ti_cpufreq_of_match[] = {
 	{},
 };
 
-static int ti_cpufreq_init(void)
+static int ti_cpufreq_probe(struct platform_device *pdev)
 {
 	u32 version[VERSION_COUNT];
 	struct device_node *np;
@@ -269,4 +270,22 @@ free_opp_data:
 
 	return ret;
 }
-device_initcall(ti_cpufreq_init);
+
+static int ti_cpufreq_init(void)
+{
+	platform_device_register_simple("ti-cpufreq", -1, NULL, 0);
+	return 0;
+}
+module_init(ti_cpufreq_init);
+
+static struct platform_driver ti_cpufreq_driver = {
+	.probe = ti_cpufreq_probe,
+	.driver = {
+		.name = "ti-cpufreq",
+	},
+};
+module_platform_driver(ti_cpufreq_driver);
+
+MODULE_DESCRIPTION("TI CPUFreq/OPP hw-supported driver");
+MODULE_AUTHOR("Dave Gerlach <d-gerlach@ti.com>");
+MODULE_LICENSE("GPL v2");

From c8343e83d45fb074da5dfc304e101995d19f05b2 Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Thu, 14 Dec 2017 22:25:26 -0600
Subject: [PATCH 35/89] cpufreq: ti-cpufreq: Add support for multiple
 regulators

Some platforms, like those in the DRA7 and AM57 families, require the
scaling of multiple regulators in order to properly support higher OPPs.
Let the ti-cpufreq driver determine when this is required and pass the
appropriate regulator names to the OPP core so that they can be properly
managed.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/ti-cpufreq.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index b1c230a1e2aa..a099b7bf74cd 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -51,6 +51,7 @@ struct ti_cpufreq_soc_data {
 	unsigned long efuse_mask;
 	unsigned long efuse_shift;
 	unsigned long rev_offset;
+	bool multi_regulator;
 };
 
 struct ti_cpufreq_data {
@@ -58,6 +59,7 @@ struct ti_cpufreq_data {
 	struct device_node *opp_node;
 	struct regmap *syscon;
 	const struct ti_cpufreq_soc_data *soc_data;
+	struct opp_table *opp_table;
 };
 
 static unsigned long amx3_efuse_xlate(struct ti_cpufreq_data *opp_data,
@@ -96,6 +98,7 @@ static struct ti_cpufreq_soc_data am3x_soc_data = {
 	.efuse_offset = 0x07fc,
 	.efuse_mask = 0x1fff,
 	.rev_offset = 0x600,
+	.multi_regulator = false,
 };
 
 static struct ti_cpufreq_soc_data am4x_soc_data = {
@@ -104,6 +107,7 @@ static struct ti_cpufreq_soc_data am4x_soc_data = {
 	.efuse_offset = 0x0610,
 	.efuse_mask = 0x3f,
 	.rev_offset = 0x600,
+	.multi_regulator = false,
 };
 
 static struct ti_cpufreq_soc_data dra7_soc_data = {
@@ -112,6 +116,7 @@ static struct ti_cpufreq_soc_data dra7_soc_data = {
 	.efuse_mask = 0xf80000,
 	.efuse_shift = 19,
 	.rev_offset = 0x204,
+	.multi_regulator = true,
 };
 
 /**
@@ -201,7 +206,9 @@ static int ti_cpufreq_probe(struct platform_device *pdev)
 	u32 version[VERSION_COUNT];
 	struct device_node *np;
 	const struct of_device_id *match;
+	struct opp_table *ti_opp_table;
 	struct ti_cpufreq_data *opp_data;
+	const char * const reg_names[] = {"vdd", "vbb"};
 	int ret;
 
 	np = of_find_node_by_path("/");
@@ -248,16 +255,29 @@ static int ti_cpufreq_probe(struct platform_device *pdev)
 	if (ret)
 		goto fail_put_node;
 
-	ret = PTR_ERR_OR_ZERO(dev_pm_opp_set_supported_hw(opp_data->cpu_dev,
-							  version, VERSION_COUNT));
-	if (ret) {
+	ti_opp_table = dev_pm_opp_set_supported_hw(opp_data->cpu_dev,
+						   version, VERSION_COUNT);
+	if (IS_ERR(ti_opp_table)) {
 		dev_err(opp_data->cpu_dev,
 			"Failed to set supported hardware\n");
+		ret = PTR_ERR(ti_opp_table);
 		goto fail_put_node;
 	}
 
-	of_node_put(opp_data->opp_node);
+	opp_data->opp_table = ti_opp_table;
 
+	if (opp_data->soc_data->multi_regulator) {
+		ti_opp_table = dev_pm_opp_set_regulators(opp_data->cpu_dev,
+							 reg_names,
+							 ARRAY_SIZE(reg_names));
+		if (IS_ERR(ti_opp_table)) {
+			dev_pm_opp_put_supported_hw(opp_data->opp_table);
+			ret =  PTR_ERR(ti_opp_table);
+			goto fail_put_node;
+		}
+	}
+
+	of_node_put(opp_data->opp_node);
 register_cpufreq_dt:
 	platform_device_register_simple("cpufreq-dt", -1, NULL, 0);
 

From 212b7287ae6098337fffa9c0cd7e139dceb98125 Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Thu, 14 Dec 2017 22:25:27 -0600
Subject: [PATCH 36/89] dt-bindings: opp: Introduce ti-opp-supply bindings

Document the devicetree bindings that describe Texas Instruments
opp-supply which allow a platform to describe multiple regulators and
additional information, such as registers containing data needed to
program aforementioned regulators.

Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../bindings/opp/ti-omap5-opp-supply.txt      | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/opp/ti-omap5-opp-supply.txt

diff --git a/Documentation/devicetree/bindings/opp/ti-omap5-opp-supply.txt b/Documentation/devicetree/bindings/opp/ti-omap5-opp-supply.txt
new file mode 100644
index 000000000000..832346e489a3
--- /dev/null
+++ b/Documentation/devicetree/bindings/opp/ti-omap5-opp-supply.txt
@@ -0,0 +1,63 @@
+Texas Instruments OMAP compatible OPP supply description
+
+OMAP5, DRA7, and AM57 family of SoCs have Class0 AVS eFuse registers which
+contain data that can be used to adjust voltages programmed for some of their
+supplies for more efficient operation. This binding provides the information
+needed to read these values and use them to program the main regulator during
+an OPP transitions.
+
+Also, some supplies may have an associated vbb-supply which is an Adaptive Body
+Bias regulator which much be transitioned in a specific sequence with regards
+to the vdd-supply and clk when making an OPP transition. By supplying two
+regulators to the device that will undergo OPP transitions we can make use
+of the multi regulator binding that is part of the OPP core described here [1]
+to describe both regulators needed by the platform.
+
+[1] Documentation/devicetree/bindings/opp/opp.txt
+
+Required Properties for Device Node:
+- vdd-supply: phandle to regulator controlling VDD supply
+- vbb-supply: phandle to regulator controlling Body Bias supply
+	      (Usually Adaptive Body Bias regulator)
+
+Required Properties for opp-supply node:
+- compatible: Should be one of:
+	"ti,omap-opp-supply" - basic OPP supply controlling VDD and VBB
+	"ti,omap5-opp-supply" - OMAP5+ optimized voltages in efuse(class0)VDD
+			    along with VBB
+	"ti,omap5-core-opp-supply" - OMAP5+ optimized voltages in efuse(class0) VDD
+			    but no VBB.
+- reg: Address and length of the efuse register set for the device (mandatory
+	only for "ti,omap5-opp-supply")
+- ti,efuse-settings: An array of u32 tuple items providing information about
+	optimized efuse configuration. Each item consists of the following:
+	volt: voltage in uV - reference voltage (OPP voltage)
+	efuse_offseet: efuse offset from reg where the optimized voltage is stored.
+- ti,absolute-max-voltage-uv: absolute maximum voltage for the OPP supply.
+
+Example:
+
+/* Device Node (CPU)  */
+cpus {
+	cpu0: cpu@0 {
+		device_type = "cpu";
+
+		...
+
+		vdd-supply = <&vcc>;
+		vbb-supply = <&abb_mpu>;
+	};
+};
+
+/* OMAP OPP Supply with Class0 registers */
+opp_supply_mpu: opp_supply@4a003b20 {
+	compatible = "ti,omap5-opp-supply";
+	reg = <0x4a003b20 0x8>;
+	ti,efuse-settings = <
+	/* uV   offset */
+	1060000 0x0
+	1160000 0x4
+	1210000 0x8
+	>;
+	ti,absolute-max-voltage-uv = <1500000>;
+};

From 9a835fa6e47f27b1ae71390b6f12efce7335aaac Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Thu, 14 Dec 2017 22:25:28 -0600
Subject: [PATCH 37/89] PM / OPP: Add ti-opp-supply driver

Introduce a ti-opp-supply driver that will use new multiple regulator
support that is part of the OPP core This is needed on TI platforms like
DRA7/AM57 in order to control both CPU regulator and Adaptive Body Bias
(ABB) regulator. These regulators must be scaled in sequence during an
OPP transition depending on whether or not the frequency is being scaled
up or down.

This driver also implements AVS Class0 for these parts by looking up the
required values from registers in the SoC and programming adjusted
optimal voltage values for each OPP.

Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/opp/Makefile        |   1 +
 drivers/opp/ti-opp-supply.c | 425 ++++++++++++++++++++++++++++++++++++
 2 files changed, 426 insertions(+)
 create mode 100644 drivers/opp/ti-opp-supply.c

diff --git a/drivers/opp/Makefile b/drivers/opp/Makefile
index e70ceb406fe9..6ce6aefacc81 100644
--- a/drivers/opp/Makefile
+++ b/drivers/opp/Makefile
@@ -2,3 +2,4 @@ ccflags-$(CONFIG_DEBUG_DRIVER)	:= -DDEBUG
 obj-y				+= core.o cpu.o
 obj-$(CONFIG_OF)		+= of.o
 obj-$(CONFIG_DEBUG_FS)		+= debugfs.o
+obj-$(CONFIG_ARM_TI_CPUFREQ)	+= ti-opp-supply.o
diff --git a/drivers/opp/ti-opp-supply.c b/drivers/opp/ti-opp-supply.c
new file mode 100644
index 000000000000..44dae3e51aac
--- /dev/null
+++ b/drivers/opp/ti-opp-supply.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2016-2017 Texas Instruments Incorporated - http://www.ti.com/
+ *	Nishanth Menon <nm@ti.com>
+ *	Dave Gerlach <d-gerlach@ti.com>
+ *
+ * TI OPP supply driver that provides override into the regulator control
+ * for generic opp core to handle devices with ABB regulator and/or
+ * SmartReflex Class0.
+ */
+#include <linux/clk.h>
+#include <linux/cpufreq.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/of_device.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pm_opp.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+
+/**
+ * struct ti_opp_supply_optimum_voltage_table - optimized voltage table
+ * @reference_uv:	reference voltage (usually Nominal voltage)
+ * @optimized_uv:	Optimized voltage from efuse
+ */
+struct ti_opp_supply_optimum_voltage_table {
+	unsigned int reference_uv;
+	unsigned int optimized_uv;
+};
+
+/**
+ * struct ti_opp_supply_data - OMAP specific opp supply data
+ * @vdd_table:	Optimized voltage mapping table
+ * @num_vdd_table: number of entries in vdd_table
+ * @vdd_absolute_max_voltage_uv: absolute maximum voltage in UV for the supply
+ */
+struct ti_opp_supply_data {
+	struct ti_opp_supply_optimum_voltage_table *vdd_table;
+	u32 num_vdd_table;
+	u32 vdd_absolute_max_voltage_uv;
+};
+
+static struct ti_opp_supply_data opp_data;
+
+/**
+ * struct ti_opp_supply_of_data - device tree match data
+ * @flags:	specific type of opp supply
+ * @efuse_voltage_mask: mask required for efuse register representing voltage
+ * @efuse_voltage_uv: Are the efuse entries in micro-volts? if not, assume
+ *		milli-volts.
+ */
+struct ti_opp_supply_of_data {
+#define OPPDM_EFUSE_CLASS0_OPTIMIZED_VOLTAGE	BIT(1)
+#define OPPDM_HAS_NO_ABB			BIT(2)
+	const u8 flags;
+	const u32 efuse_voltage_mask;
+	const bool efuse_voltage_uv;
+};
+
+/**
+ * _store_optimized_voltages() - store optimized voltages
+ * @dev:	ti opp supply device for which we need to store info
+ * @data:	data specific to the device
+ *
+ * Picks up efuse based optimized voltages for VDD unique per device and
+ * stores it in internal data structure for use during transition requests.
+ *
+ * Return: If successful, 0, else appropriate error value.
+ */
+static int _store_optimized_voltages(struct device *dev,
+				     struct ti_opp_supply_data *data)
+{
+	void __iomem *base;
+	struct property *prop;
+	struct resource *res;
+	const __be32 *val;
+	int proplen, i;
+	int ret = 0;
+	struct ti_opp_supply_optimum_voltage_table *table;
+	const struct ti_opp_supply_of_data *of_data = dev_get_drvdata(dev);
+
+	/* pick up Efuse based voltages */
+	res = platform_get_resource(to_platform_device(dev), IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(dev, "Unable to get IO resource\n");
+		ret = -ENODEV;
+		goto out_map;
+	}
+
+	base = ioremap_nocache(res->start, resource_size(res));
+	if (!base) {
+		dev_err(dev, "Unable to map Efuse registers\n");
+		ret = -ENOMEM;
+		goto out_map;
+	}
+
+	/* Fetch efuse-settings. */
+	prop = of_find_property(dev->of_node, "ti,efuse-settings", NULL);
+	if (!prop) {
+		dev_err(dev, "No 'ti,efuse-settings' property found\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	proplen = prop->length / sizeof(int);
+	data->num_vdd_table = proplen / 2;
+	/* Verify for corrupted OPP entries in dt */
+	if (data->num_vdd_table * 2 * sizeof(int) != prop->length) {
+		dev_err(dev, "Invalid 'ti,efuse-settings'\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = of_property_read_u32(dev->of_node, "ti,absolute-max-voltage-uv",
+				   &data->vdd_absolute_max_voltage_uv);
+	if (ret) {
+		dev_err(dev, "ti,absolute-max-voltage-uv is missing\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	table = kzalloc(sizeof(*data->vdd_table) *
+				  data->num_vdd_table, GFP_KERNEL);
+	if (!table) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	data->vdd_table = table;
+
+	val = prop->value;
+	for (i = 0; i < data->num_vdd_table; i++, table++) {
+		u32 efuse_offset;
+		u32 tmp;
+
+		table->reference_uv = be32_to_cpup(val++);
+		efuse_offset = be32_to_cpup(val++);
+
+		tmp = readl(base + efuse_offset);
+		tmp &= of_data->efuse_voltage_mask;
+		tmp >>= __ffs(of_data->efuse_voltage_mask);
+
+		table->optimized_uv = of_data->efuse_voltage_uv ? tmp :
+					tmp * 1000;
+
+		dev_dbg(dev, "[%d] efuse=0x%08x volt_table=%d vset=%d\n",
+			i, efuse_offset, table->reference_uv,
+			table->optimized_uv);
+
+		/*
+		 * Some older samples might not have optimized efuse
+		 * Use reference voltage for those - just add debug message
+		 * for them.
+		 */
+		if (!table->optimized_uv) {
+			dev_dbg(dev, "[%d] efuse=0x%08x volt_table=%d:vset0\n",
+				i, efuse_offset, table->reference_uv);
+			table->optimized_uv = table->reference_uv;
+		}
+	}
+out:
+	iounmap(base);
+out_map:
+	return ret;
+}
+
+/**
+ * _free_optimized_voltages() - free resources for optvoltages
+ * @dev:	device for which we need to free info
+ * @data:	data specific to the device
+ */
+static void _free_optimized_voltages(struct device *dev,
+				     struct ti_opp_supply_data *data)
+{
+	kfree(data->vdd_table);
+	data->vdd_table = NULL;
+	data->num_vdd_table = 0;
+}
+
+/**
+ * _get_optimal_vdd_voltage() - Finds optimal voltage for the supply
+ * @dev:	device for which we need to find info
+ * @data:	data specific to the device
+ * @reference_uv:	reference voltage (OPP voltage) for which we need value
+ *
+ * Return: if a match is found, return optimized voltage, else return
+ * reference_uv, also return reference_uv if no optimization is needed.
+ */
+static int _get_optimal_vdd_voltage(struct device *dev,
+				    struct ti_opp_supply_data *data,
+				    int reference_uv)
+{
+	int i;
+	struct ti_opp_supply_optimum_voltage_table *table;
+
+	if (!data->num_vdd_table)
+		return reference_uv;
+
+	table = data->vdd_table;
+	if (!table)
+		return -EINVAL;
+
+	/* Find a exact match - this list is usually very small */
+	for (i = 0; i < data->num_vdd_table; i++, table++)
+		if (table->reference_uv == reference_uv)
+			return table->optimized_uv;
+
+	/* IF things are screwed up, we'd make a mess on console.. ratelimit */
+	dev_err_ratelimited(dev, "%s: Failed optimized voltage match for %d\n",
+			    __func__, reference_uv);
+	return reference_uv;
+}
+
+static int _opp_set_voltage(struct device *dev,
+			    struct dev_pm_opp_supply *supply,
+			    int new_target_uv, struct regulator *reg,
+			    char *reg_name)
+{
+	int ret;
+	unsigned long vdd_uv, uv_max;
+
+	if (new_target_uv)
+		vdd_uv = new_target_uv;
+	else
+		vdd_uv = supply->u_volt;
+
+	/*
+	 * If we do have an absolute max voltage specified, then we should
+	 * use that voltage instead to allow for cases where the voltage rails
+	 * are ganged (example if we set the max for an opp as 1.12v, and
+	 * the absolute max is 1.5v, for another rail to get 1.25v, it cannot
+	 * be achieved if the regulator is constrainted to max of 1.12v, even
+	 * if it can function at 1.25v
+	 */
+	if (opp_data.vdd_absolute_max_voltage_uv)
+		uv_max = opp_data.vdd_absolute_max_voltage_uv;
+	else
+		uv_max = supply->u_volt_max;
+
+	if (vdd_uv > uv_max ||
+	    vdd_uv < supply->u_volt_min ||
+	    supply->u_volt_min > uv_max) {
+		dev_warn(dev,
+			 "Invalid range voltages [Min:%lu target:%lu Max:%lu]\n",
+			 supply->u_volt_min, vdd_uv, uv_max);
+		return -EINVAL;
+	}
+
+	dev_dbg(dev, "%s scaling to %luuV[min %luuV max %luuV]\n", reg_name,
+		vdd_uv, supply->u_volt_min,
+		uv_max);
+
+	ret = regulator_set_voltage_triplet(reg,
+					    supply->u_volt_min,
+					    vdd_uv,
+					    uv_max);
+	if (ret) {
+		dev_err(dev, "%s failed for %luuV[min %luuV max %luuV]\n",
+			reg_name, vdd_uv, supply->u_volt_min,
+			uv_max);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * ti_opp_supply_set_opp() - do the opp supply transition
+ * @data:	information on regulators and new and old opps provided by
+ *		opp core to use in transition
+ *
+ * Return: If successful, 0, else appropriate error value.
+ */
+int ti_opp_supply_set_opp(struct dev_pm_set_opp_data *data)
+{
+	struct dev_pm_opp_supply *old_supply_vdd = &data->old_opp.supplies[0];
+	struct dev_pm_opp_supply *old_supply_vbb = &data->old_opp.supplies[1];
+	struct dev_pm_opp_supply *new_supply_vdd = &data->new_opp.supplies[0];
+	struct dev_pm_opp_supply *new_supply_vbb = &data->new_opp.supplies[1];
+	struct device *dev = data->dev;
+	unsigned long old_freq = data->old_opp.rate, freq = data->new_opp.rate;
+	struct clk *clk = data->clk;
+	struct regulator *vdd_reg = data->regulators[0];
+	struct regulator *vbb_reg = data->regulators[1];
+	int vdd_uv;
+	int ret;
+
+	vdd_uv = _get_optimal_vdd_voltage(dev, &opp_data,
+					  new_supply_vbb->u_volt);
+
+	/* Scaling up? Scale voltage before frequency */
+	if (freq > old_freq) {
+		ret = _opp_set_voltage(dev, new_supply_vdd, vdd_uv, vdd_reg,
+				       "vdd");
+		if (ret)
+			goto restore_voltage;
+
+		ret = _opp_set_voltage(dev, new_supply_vbb, 0, vbb_reg, "vbb");
+		if (ret)
+			goto restore_voltage;
+	}
+
+	/* Change frequency */
+	dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n",
+		__func__, old_freq, freq);
+
+	ret = clk_set_rate(clk, freq);
+	if (ret) {
+		dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+			ret);
+		goto restore_voltage;
+	}
+
+	/* Scaling down? Scale voltage after frequency */
+	if (freq < old_freq) {
+		ret = _opp_set_voltage(dev, new_supply_vbb, 0, vbb_reg, "vbb");
+		if (ret)
+			goto restore_freq;
+
+		ret = _opp_set_voltage(dev, new_supply_vdd, vdd_uv, vdd_reg,
+				       "vdd");
+		if (ret)
+			goto restore_freq;
+	}
+
+	return 0;
+
+restore_freq:
+	ret = clk_set_rate(clk, old_freq);
+	if (ret)
+		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
+			__func__, old_freq);
+restore_voltage:
+	/* This shouldn't harm even if the voltages weren't updated earlier */
+	if (old_supply_vdd->u_volt) {
+		ret = _opp_set_voltage(dev, old_supply_vbb, 0, vbb_reg, "vbb");
+		if (ret)
+			return ret;
+
+		ret = _opp_set_voltage(dev, old_supply_vdd, 0, vdd_reg,
+				       "vdd");
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+
+static const struct ti_opp_supply_of_data omap_generic_of_data = {
+};
+
+static const struct ti_opp_supply_of_data omap_omap5_of_data = {
+	.flags = OPPDM_EFUSE_CLASS0_OPTIMIZED_VOLTAGE,
+	.efuse_voltage_mask = 0xFFF,
+	.efuse_voltage_uv = false,
+};
+
+static const struct ti_opp_supply_of_data omap_omap5core_of_data = {
+	.flags = OPPDM_EFUSE_CLASS0_OPTIMIZED_VOLTAGE | OPPDM_HAS_NO_ABB,
+	.efuse_voltage_mask = 0xFFF,
+	.efuse_voltage_uv = false,
+};
+
+static const struct of_device_id ti_opp_supply_of_match[] = {
+	{.compatible = "ti,omap-opp-supply", .data = &omap_generic_of_data},
+	{.compatible = "ti,omap5-opp-supply", .data = &omap_omap5_of_data},
+	{.compatible = "ti,omap5-core-opp-supply",
+	 .data = &omap_omap5core_of_data},
+	{},
+};
+MODULE_DEVICE_TABLE(of, ti_opp_supply_of_match);
+
+static int ti_opp_supply_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct device *cpu_dev = get_cpu_device(0);
+	const struct of_device_id *match;
+	const struct ti_opp_supply_of_data *of_data;
+	int ret = 0;
+
+	match = of_match_device(ti_opp_supply_of_match, dev);
+	if (!match) {
+		/* We do not expect this to happen */
+		dev_err(dev, "%s: Unable to match device\n", __func__);
+		return -ENODEV;
+	}
+	if (!match->data) {
+		/* Again, unlikely.. but mistakes do happen */
+		dev_err(dev, "%s: Bad data in match\n", __func__);
+		return -EINVAL;
+	}
+	of_data = match->data;
+
+	dev_set_drvdata(dev, (void *)of_data);
+
+	/* If we need optimized voltage */
+	if (of_data->flags & OPPDM_EFUSE_CLASS0_OPTIMIZED_VOLTAGE) {
+		ret = _store_optimized_voltages(dev, &opp_data);
+		if (ret)
+			return ret;
+	}
+
+	ret = PTR_ERR_OR_ZERO(dev_pm_opp_register_set_opp_helper(cpu_dev,
+								 ti_opp_supply_set_opp));
+	if (ret)
+		_free_optimized_voltages(dev, &opp_data);
+
+	return ret;
+}
+
+static struct platform_driver ti_opp_supply_driver = {
+	.probe = ti_opp_supply_probe,
+	.driver = {
+		   .name = "ti_opp_supply",
+		   .owner = THIS_MODULE,
+		   .of_match_table = of_match_ptr(ti_opp_supply_of_match),
+		   },
+};
+module_platform_driver(ti_opp_supply_driver);
+
+MODULE_DESCRIPTION("Texas Instruments OMAP OPP Supply driver");
+MODULE_AUTHOR("Texas Instruments Inc.");
+MODULE_LICENSE("GPL v2");

From 86ddd2db1f75a30f21a4c4de7a29249ee8c37ed8 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris@chromium.org>
Date: Mon, 18 Dec 2017 15:30:48 -0800
Subject: [PATCH 38/89] PM / wakeup: only recommend "call"ing
 device_init_wakeup() once

I'll admit admit it: I've written bad driver code that tries to
configure a device's wake IRQ without having called device_init_wakeup()
first. But do you really have to ask ask me twice?

Signed-off-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 38559f04db2c..cb72965b3281 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -298,7 +298,7 @@ int device_wakeup_attach_irq(struct device *dev,
 
 	ws = dev->power.wakeup;
 	if (!ws) {
-		dev_err(dev, "forgot to call call device_init_wakeup?\n");
+		dev_err(dev, "forgot to call device_init_wakeup?\n");
 		return -EINVAL;
 	}
 

From 972bc90ed8f6008f5e2d42c72d894b2401736519 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 18 Nov 2016 12:23:46 +0530
Subject: [PATCH 39/89] OPP: Allow OPP table to be used for power-domains

Power-domains can also have their active states and this patch enhances
the OPP binding to define those. The power domains can use the OPP
bindings as is, with one additional change to Allow
"operating-points-v2" property to contain multiple phandles for power
domain providers providing multiple domains.

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 Documentation/devicetree/bindings/opp/opp.txt            | 5 +++++
 Documentation/devicetree/bindings/power/power_domain.txt | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index 9d733af26be7..a3953a1bb1a1 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -45,6 +45,11 @@ Devices supporting OPPs must set their "operating-points-v2" property with
 phandle to a OPP table in their DT node. The OPP core will use this phandle to
 find the operating points for the device.
 
+This can contain more than one phandle for power domain providers that provide
+multiple power domains. That is, one phandle for each power domain. If only one
+phandle is available, then the same OPP table will be used for all power domains
+provided by the power domain provider.
+
 If required, this can be extended for SoC vendor specific bindings. Such bindings
 should be documented as Documentation/devicetree/bindings/power/<vendor>-opp.txt
 and should have a compatible description like: "operating-points-v2-<vendor>".
diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index 14bd9e945ff6..61549840ab3b 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -40,6 +40,12 @@ Optional properties:
   domain's idle states. In the absence of this property, the domain would be
   considered as capable of being powered-on or powered-off.
 
+- operating-points-v2 : Phandles to the OPP tables of power domains provided by
+  a power domain provider. If the provider provides a single power domain only
+  or all the power domains provided by the provider have identical OPP tables,
+  then this shall contain a single phandle. Refer to ../opp/opp.txt for more
+  information.
+
 Example:
 
 	power: power-controller@12340000 {

From e856f078bcf120e1627c750014d2612974ed81a2 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 18 Dec 2017 15:23:48 +0530
Subject: [PATCH 40/89] OPP: Introduce "required-opp" property

Devices have inter-dependencies some times. For example a device that
needs to run at 800 MHz, needs another device (e.g. Its power domain) to
be configured at a particular operating performance point.

This patch introduces a new property "required-opp" which can be present
directly in a device's node (if it doesn't need to change its OPPs), or
in device's OPP nodes. More details on the property can be seen in the
binding itself.

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 Documentation/devicetree/bindings/opp/opp.txt |  8 +++
 .../bindings/power/power_domain.txt           | 59 +++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index a3953a1bb1a1..4e4f30288c8b 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -159,6 +159,14 @@ Optional properties:
 
 - status: Marks the node enabled/disabled.
 
+- required-opp: This contains phandle to an OPP node in another device's OPP
+  table. It may contain an array of phandles, where each phandle points to an
+  OPP of a different device. It should not contain multiple phandles to the OPP
+  nodes in the same OPP table. This specifies the minimum required OPP of the
+  device(s), whose OPP's phandle is present in this property, for the
+  functioning of the current device at the current OPP (where this property is
+  present).
+
 Example 1: Single cluster Dual-core ARM cortex A9, switch DVFS states together.
 
 / {
diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index 61549840ab3b..f3355313c020 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -126,4 +126,63 @@ The node above defines a typical PM domain consumer device, which is located
 inside a PM domain with index 0 of a power controller represented by a node
 with the label "power".
 
+Optional properties:
+- required-opp: This contains phandle to an OPP node in another device's OPP
+  table. It may contain an array of phandles, where each phandle points to an
+  OPP of a different device. It should not contain multiple phandles to the OPP
+  nodes in the same OPP table. This specifies the minimum required OPP of the
+  device(s), whose OPP's phandle is present in this property, for the
+  functioning of the current device at the current OPP (where this property is
+  present).
+
+Example:
+- OPP table for domain provider that provides two domains.
+
+	domain0_opp_table: opp-table0 {
+		compatible = "operating-points-v2";
+
+		domain0_opp_0: opp-1000000000 {
+			opp-hz = /bits/ 64 <1000000000>;
+			opp-microvolt = <975000 970000 985000>;
+		};
+		domain0_opp_1: opp-1100000000 {
+			opp-hz = /bits/ 64 <1100000000>;
+			opp-microvolt = <1000000 980000 1010000>;
+		};
+	};
+
+	domain1_opp_table: opp-table1 {
+		compatible = "operating-points-v2";
+
+		domain1_opp_0: opp-1200000000 {
+			opp-hz = /bits/ 64 <1200000000>;
+			opp-microvolt = <975000 970000 985000>;
+		};
+		domain1_opp_1: opp-1300000000 {
+			opp-hz = /bits/ 64 <1300000000>;
+			opp-microvolt = <1000000 980000 1010000>;
+		};
+	};
+
+	power: power-controller@12340000 {
+		compatible = "foo,power-controller";
+		reg = <0x12340000 0x1000>;
+		#power-domain-cells = <1>;
+		operating-points-v2 = <&domain0_opp_table>, <&domain1_opp_table>;
+	};
+
+	leaky-device0@12350000 {
+		compatible = "foo,i-leak-current";
+		reg = <0x12350000 0x1000>;
+		power-domains = <&power 0>;
+		required-opp = <&domain0_opp_0>;
+	};
+
+	leaky-device1@12350000 {
+		compatible = "foo,i-leak-current";
+		reg = <0x12350000 0x1000>;
+		power-domains = <&power 1>;
+		required-opp = <&domain1_opp_1>;
+	};
+
 [1]. Documentation/devicetree/bindings/power/domain-idle-state.txt

From 3eff5f67a21997b4a86e0e5062fc72c2347e25bf Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 20 Dec 2017 05:44:40 +0000
Subject: [PATCH 41/89] PM / OPP: Make local function ti_opp_supply_set_opp()
 static

Fixes the following sparse warning:

drivers/opp/ti-opp-supply.c:276:5: warning:
 symbol 'ti_opp_supply_set_opp' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/opp/ti-opp-supply.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/opp/ti-opp-supply.c b/drivers/opp/ti-opp-supply.c
index 44dae3e51aac..370eff3acd8a 100644
--- a/drivers/opp/ti-opp-supply.c
+++ b/drivers/opp/ti-opp-supply.c
@@ -273,7 +273,7 @@ static int _opp_set_voltage(struct device *dev,
  *
  * Return: If successful, 0, else appropriate error value.
  */
-int ti_opp_supply_set_opp(struct dev_pm_set_opp_data *data)
+static int ti_opp_supply_set_opp(struct dev_pm_set_opp_data *data)
 {
 	struct dev_pm_opp_supply *old_supply_vdd = &data->old_opp.supplies[0];
 	struct dev_pm_opp_supply *old_supply_vbb = &data->old_opp.supplies[1];

From 2332bd04199353b06bf35f14f972d518907f08e0 Mon Sep 17 00:00:00 2001
From: Dong Aisheng <aisheng.dong@nxp.com>
Date: Sat, 23 Dec 2017 12:53:52 +0800
Subject: [PATCH 42/89] cpufreq: imx6q: switch to Use clk_bulk_get() to refine
 clk operations

Use clk_bulk_get() to simplify the driver's clocks handling.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Dong Aisheng <aisheng.dong@nxp.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/imx6q-cpufreq.c | 123 ++++++++++++++------------------
 1 file changed, 55 insertions(+), 68 deletions(-)

diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index d9b2c2de49c4..8bfb0775662b 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -25,15 +25,29 @@ static struct regulator *arm_reg;
 static struct regulator *pu_reg;
 static struct regulator *soc_reg;
 
-static struct clk *arm_clk;
-static struct clk *pll1_sys_clk;
-static struct clk *pll1_sw_clk;
-static struct clk *step_clk;
-static struct clk *pll2_pfd2_396m_clk;
+enum IMX6_CPUFREQ_CLKS {
+	ARM,
+	PLL1_SYS,
+	STEP,
+	PLL1_SW,
+	PLL2_PFD2_396M,
+	/* MX6UL requires two more clks */
+	PLL2_BUS,
+	SECONDARY_SEL,
+};
+#define IMX6Q_CPUFREQ_CLK_NUM		5
+#define IMX6UL_CPUFREQ_CLK_NUM		7
 
-/* clk used by i.MX6UL */
-static struct clk *pll2_bus_clk;
-static struct clk *secondary_sel_clk;
+static int num_clks;
+static struct clk_bulk_data clks[] = {
+	{ .id = "arm" },
+	{ .id = "pll1_sys" },
+	{ .id = "step" },
+	{ .id = "pll1_sw" },
+	{ .id = "pll2_pfd2_396m" },
+	{ .id = "pll2_bus" },
+	{ .id = "secondary_sel" },
+};
 
 static struct device *cpu_dev;
 static bool free_opp;
@@ -53,7 +67,7 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 
 	new_freq = freq_table[index].frequency;
 	freq_hz = new_freq * 1000;
-	old_freq = clk_get_rate(arm_clk) / 1000;
+	old_freq = clk_get_rate(clks[ARM].clk) / 1000;
 
 	opp = dev_pm_opp_find_freq_ceil(cpu_dev, &freq_hz);
 	if (IS_ERR(opp)) {
@@ -112,29 +126,31 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 		 * voltage of 528MHz, so lower the CPU frequency to one
 		 * half before changing CPU frequency.
 		 */
-		clk_set_rate(arm_clk, (old_freq >> 1) * 1000);
-		clk_set_parent(pll1_sw_clk, pll1_sys_clk);
-		if (freq_hz > clk_get_rate(pll2_pfd2_396m_clk))
-			clk_set_parent(secondary_sel_clk, pll2_bus_clk);
+		clk_set_rate(clks[ARM].clk, (old_freq >> 1) * 1000);
+		clk_set_parent(clks[PLL1_SW].clk, clks[PLL1_SYS].clk);
+		if (freq_hz > clk_get_rate(clks[PLL2_PFD2_396M].clk))
+			clk_set_parent(clks[SECONDARY_SEL].clk,
+				       clks[PLL2_BUS].clk);
 		else
-			clk_set_parent(secondary_sel_clk, pll2_pfd2_396m_clk);
-		clk_set_parent(step_clk, secondary_sel_clk);
-		clk_set_parent(pll1_sw_clk, step_clk);
+			clk_set_parent(clks[SECONDARY_SEL].clk,
+				       clks[PLL2_PFD2_396M].clk);
+		clk_set_parent(clks[STEP].clk, clks[SECONDARY_SEL].clk);
+		clk_set_parent(clks[PLL1_SW].clk, clks[STEP].clk);
 	} else {
-		clk_set_parent(step_clk, pll2_pfd2_396m_clk);
-		clk_set_parent(pll1_sw_clk, step_clk);
-		if (freq_hz > clk_get_rate(pll2_pfd2_396m_clk)) {
-			clk_set_rate(pll1_sys_clk, new_freq * 1000);
-			clk_set_parent(pll1_sw_clk, pll1_sys_clk);
+		clk_set_parent(clks[STEP].clk, clks[PLL2_PFD2_396M].clk);
+		clk_set_parent(clks[PLL1_SW].clk, clks[STEP].clk);
+		if (freq_hz > clk_get_rate(clks[PLL2_PFD2_396M].clk)) {
+			clk_set_rate(clks[PLL1_SYS].clk, new_freq * 1000);
+			clk_set_parent(clks[PLL1_SW].clk, clks[PLL1_SYS].clk);
 		} else {
 			/* pll1_sys needs to be enabled for divider rate change to work. */
 			pll1_sys_temp_enabled = true;
-			clk_prepare_enable(pll1_sys_clk);
+			clk_prepare_enable(clks[PLL1_SYS].clk);
 		}
 	}
 
 	/* Ensure the arm clock divider is what we expect */
-	ret = clk_set_rate(arm_clk, new_freq * 1000);
+	ret = clk_set_rate(clks[ARM].clk, new_freq * 1000);
 	if (ret) {
 		dev_err(cpu_dev, "failed to set clock rate: %d\n", ret);
 		regulator_set_voltage_tol(arm_reg, volt_old, 0);
@@ -143,7 +159,7 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 
 	/* PLL1 is only needed until after ARM-PODF is set. */
 	if (pll1_sys_temp_enabled)
-		clk_disable_unprepare(pll1_sys_clk);
+		clk_disable_unprepare(clks[PLL1_SYS].clk);
 
 	/* scaling down?  scale voltage after frequency */
 	if (new_freq < old_freq) {
@@ -174,7 +190,7 @@ static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
 {
 	int ret;
 
-	policy->clk = arm_clk;
+	policy->clk = clks[ARM].clk;
 	ret = cpufreq_generic_init(policy, freq_table, transition_latency);
 	policy->suspend_freq = policy->max;
 
@@ -266,28 +282,15 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev)
 		return -ENOENT;
 	}
 
-	arm_clk = clk_get(cpu_dev, "arm");
-	pll1_sys_clk = clk_get(cpu_dev, "pll1_sys");
-	pll1_sw_clk = clk_get(cpu_dev, "pll1_sw");
-	step_clk = clk_get(cpu_dev, "step");
-	pll2_pfd2_396m_clk = clk_get(cpu_dev, "pll2_pfd2_396m");
-	if (IS_ERR(arm_clk) || IS_ERR(pll1_sys_clk) || IS_ERR(pll1_sw_clk) ||
-	    IS_ERR(step_clk) || IS_ERR(pll2_pfd2_396m_clk)) {
-		dev_err(cpu_dev, "failed to get clocks\n");
-		ret = -ENOENT;
-		goto put_clk;
-	}
-
 	if (of_machine_is_compatible("fsl,imx6ul") ||
-	    of_machine_is_compatible("fsl,imx6ull")) {
-		pll2_bus_clk = clk_get(cpu_dev, "pll2_bus");
-		secondary_sel_clk = clk_get(cpu_dev, "secondary_sel");
-		if (IS_ERR(pll2_bus_clk) || IS_ERR(secondary_sel_clk)) {
-			dev_err(cpu_dev, "failed to get clocks specific to imx6ul\n");
-			ret = -ENOENT;
-			goto put_clk;
-		}
-	}
+	    of_machine_is_compatible("fsl,imx6ull"))
+		num_clks = IMX6UL_CPUFREQ_CLK_NUM;
+	else
+		num_clks = IMX6Q_CPUFREQ_CLK_NUM;
+
+	ret = clk_bulk_get(cpu_dev, num_clks, clks);
+	if (ret)
+		goto put_node;
 
 	arm_reg = regulator_get(cpu_dev, "arm");
 	pu_reg = regulator_get_optional(cpu_dev, "pu");
@@ -424,22 +427,11 @@ put_reg:
 		regulator_put(pu_reg);
 	if (!IS_ERR(soc_reg))
 		regulator_put(soc_reg);
-put_clk:
-	if (!IS_ERR(arm_clk))
-		clk_put(arm_clk);
-	if (!IS_ERR(pll1_sys_clk))
-		clk_put(pll1_sys_clk);
-	if (!IS_ERR(pll1_sw_clk))
-		clk_put(pll1_sw_clk);
-	if (!IS_ERR(step_clk))
-		clk_put(step_clk);
-	if (!IS_ERR(pll2_pfd2_396m_clk))
-		clk_put(pll2_pfd2_396m_clk);
-	if (!IS_ERR(pll2_bus_clk))
-		clk_put(pll2_bus_clk);
-	if (!IS_ERR(secondary_sel_clk))
-		clk_put(secondary_sel_clk);
+
+	clk_bulk_put(num_clks, clks);
+put_node:
 	of_node_put(np);
+
 	return ret;
 }
 
@@ -453,13 +445,8 @@ static int imx6q_cpufreq_remove(struct platform_device *pdev)
 	if (!IS_ERR(pu_reg))
 		regulator_put(pu_reg);
 	regulator_put(soc_reg);
-	clk_put(arm_clk);
-	clk_put(pll1_sys_clk);
-	clk_put(pll1_sw_clk);
-	clk_put(step_clk);
-	clk_put(pll2_pfd2_396m_clk);
-	clk_put(pll2_bus_clk);
-	clk_put(secondary_sel_clk);
+
+	clk_bulk_put(num_clks, clks);
 
 	return 0;
 }

From d1bf2d30728f310f72296b54f0651ecdb09cbb12 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Sun, 5 Nov 2017 21:27:41 -0800
Subject: [PATCH 43/89] PM / devfreq: Propagate error from devfreq_add_device()

Propagate the error of devfreq_add_device() in devm_devfreq_add_device()
rather than statically returning ENOMEM. This makes it slightly faster
to pinpoint the cause of a returned error.

Fixes: 8cd84092d35e ("PM / devfreq: Add resource-managed function for devfreq device")
Cc: stable@vger.kernel.org
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/devfreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 78fb496ecb4e..99c4021fc33b 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -737,7 +737,7 @@ struct devfreq *devm_devfreq_add_device(struct device *dev,
 	devfreq = devfreq_add_device(dev, profile, governor_name, data);
 	if (IS_ERR(devfreq)) {
 		devres_free(ptr);
-		return ERR_PTR(-ENOMEM);
+		return devfreq;
 	}
 
 	*ptr = devfreq;

From 63f1e05f7fe9ca509c60154d6a833abf96eecdc9 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Wed, 6 Dec 2017 14:20:15 -0600
Subject: [PATCH 44/89] PM / devfreq: Fix potential NULL pointer dereference in
 governor_store

df->governor is being dereferenced before it is null checked,
hence there is a potential null pointer dereference.

Notice that df->governor is being null checked at line 1004:
if (df->governor) {, which implies it might be null.

Fix this by null checking df->governor before dereferencing it.

Addresses-Coverity-ID: 1401988 ("Dereference before null check")
Fixes: bcf23c79c4e4 ("PM / devfreq: Fix available_governor sysfs")
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/devfreq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 99c4021fc33b..fe2af6aa88fc 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -996,7 +996,8 @@ static ssize_t governor_store(struct device *dev, struct device_attribute *attr,
 	if (df->governor == governor) {
 		ret = 0;
 		goto out;
-	} else if (df->governor->immutable || governor->immutable) {
+	} else if ((df->governor && df->governor->immutable) ||
+					governor->immutable) {
 		ret = -EINVAL;
 		goto out;
 	}

From d97c2e0d635e39b5b63784deb3212e846ebf76dc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 26 Dec 2017 01:50:20 +0100
Subject: [PATCH 45/89] PM / wakeup: Drop redundant check from
 device_set_wakeup_enable()

Since both device_wakeup_enable() and device_wakeup_disable() check
if dev is not NULL and whether or not power.can_wakeup is set for it,
device_set_wakeup_enable() doesn't have to do that, so drop that
check from it.

No intentional changes in functionality.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/base/power/wakeup.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index cb72965b3281..90c7212de087 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -464,9 +464,6 @@ EXPORT_SYMBOL_GPL(device_init_wakeup);
  */
 int device_set_wakeup_enable(struct device *dev, bool enable)
 {
-	if (!dev || !dev->power.can_wakeup)
-		return -EINVAL;
-
 	return enable ? device_wakeup_enable(dev) : device_wakeup_disable(dev);
 }
 EXPORT_SYMBOL_GPL(device_set_wakeup_enable);

From 9dbc64a5d5938b990a045509ff5356fc53e4abd4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 2 Jan 2018 01:42:56 +0100
Subject: [PATCH 46/89] PM / wakeup: Drop redundant check from
 device_init_wakeup()

Since device_wakeup_disable() checks the device's power.can_wakeup
flag, device_init_wakeup() doesn't need to do that before calling it,
so drop that redundant check from device_init_wakeup().

No intentional changes in functionality.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/base/power/wakeup.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 90c7212de087..b7b8b2fe89c6 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -448,9 +448,7 @@ int device_init_wakeup(struct device *dev, bool enable)
 		device_set_wakeup_capable(dev, true);
 		ret = device_wakeup_enable(dev);
 	} else {
-		if (dev->power.can_wakeup)
-			device_wakeup_disable(dev);
-
+		device_wakeup_disable(dev);
 		device_set_wakeup_capable(dev, false);
 	}
 

From 4fa3061a6856cc72f3f984702145bb30f16ee40e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 10 Dec 2017 00:58:18 +0100
Subject: [PATCH 47/89] PM / core: Add helpers for subsystem callback selection

Add helper routines to find and return a suitable subsystem callback
during the "noirq" phases of system suspend/resume (or analogous)
transitions as well as during the "late" phase of system suspend and
the "early" phase of system resume (or analogous) transitions.

The helpers will be called from additional sites going forward.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/base/power/main.c | 222 +++++++++++++++++++++++++-------------
 1 file changed, 145 insertions(+), 77 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 6e8cc5de93fd..3c5fdf155c91 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -551,31 +551,12 @@ bool dev_pm_may_skip_resume(struct device *dev)
 	return !dev->power.must_resume && pm_transition.event != PM_EVENT_RESTORE;
 }
 
-/**
- * device_resume_noirq - Execute a "noirq resume" callback for given device.
- * @dev: Device to handle.
- * @state: PM transition of the system being carried out.
- * @async: If true, the device is being resumed asynchronously.
- *
- * The driver of @dev will not receive interrupts while this function is being
- * executed.
- */
-static int device_resume_noirq(struct device *dev, pm_message_t state, bool async)
+static pm_callback_t dpm_subsys_resume_noirq_cb(struct device *dev,
+						pm_message_t state,
+						const char **info_p)
 {
-	pm_callback_t callback = NULL;
-	const char *info = NULL;
-	int error = 0;
-
-	TRACE_DEVICE(dev);
-	TRACE_RESUME(0);
-
-	if (dev->power.syscore || dev->power.direct_complete)
-		goto Out;
-
-	if (!dev->power.is_noirq_suspended)
-		goto Out;
-
-	dpm_wait_for_superior(dev, async);
+	pm_callback_t callback;
+	const char *info;
 
 	if (dev->pm_domain) {
 		info = "noirq power domain ";
@@ -589,8 +570,44 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
 	} else if (dev->bus && dev->bus->pm) {
 		info = "noirq bus ";
 		callback = pm_noirq_op(dev->bus->pm, state);
+	} else {
+		return NULL;
 	}
 
+	if (info_p)
+		*info_p = info;
+
+	return callback;
+}
+
+/**
+ * device_resume_noirq - Execute a "noirq resume" callback for given device.
+ * @dev: Device to handle.
+ * @state: PM transition of the system being carried out.
+ * @async: If true, the device is being resumed asynchronously.
+ *
+ * The driver of @dev will not receive interrupts while this function is being
+ * executed.
+ */
+static int device_resume_noirq(struct device *dev, pm_message_t state, bool async)
+{
+	pm_callback_t callback;
+	const char *info;
+	int error = 0;
+
+	TRACE_DEVICE(dev);
+	TRACE_RESUME(0);
+
+	if (dev->power.syscore || dev->power.direct_complete)
+		goto Out;
+
+	if (!dev->power.is_noirq_suspended)
+		goto Out;
+
+	dpm_wait_for_superior(dev, async);
+
+	callback = dpm_subsys_resume_noirq_cb(dev, state, &info);
+
 	if (!callback && dev->driver && dev->driver->pm) {
 		info = "noirq driver ";
 		callback = pm_noirq_op(dev->driver->pm, state);
@@ -704,30 +721,12 @@ void dpm_resume_noirq(pm_message_t state)
 	dpm_noirq_end();
 }
 
-/**
- * device_resume_early - Execute an "early resume" callback for given device.
- * @dev: Device to handle.
- * @state: PM transition of the system being carried out.
- * @async: If true, the device is being resumed asynchronously.
- *
- * Runtime PM is disabled for @dev while this function is being executed.
- */
-static int device_resume_early(struct device *dev, pm_message_t state, bool async)
+static pm_callback_t dpm_subsys_resume_early_cb(struct device *dev,
+						pm_message_t state,
+						const char **info_p)
 {
-	pm_callback_t callback = NULL;
-	const char *info = NULL;
-	int error = 0;
-
-	TRACE_DEVICE(dev);
-	TRACE_RESUME(0);
-
-	if (dev->power.syscore || dev->power.direct_complete)
-		goto Out;
-
-	if (!dev->power.is_late_suspended)
-		goto Out;
-
-	dpm_wait_for_superior(dev, async);
+	pm_callback_t callback;
+	const char *info;
 
 	if (dev->pm_domain) {
 		info = "early power domain ";
@@ -741,8 +740,43 @@ static int device_resume_early(struct device *dev, pm_message_t state, bool asyn
 	} else if (dev->bus && dev->bus->pm) {
 		info = "early bus ";
 		callback = pm_late_early_op(dev->bus->pm, state);
+	} else {
+		return NULL;
 	}
 
+	if (info_p)
+		*info_p = info;
+
+	return callback;
+}
+
+/**
+ * device_resume_early - Execute an "early resume" callback for given device.
+ * @dev: Device to handle.
+ * @state: PM transition of the system being carried out.
+ * @async: If true, the device is being resumed asynchronously.
+ *
+ * Runtime PM is disabled for @dev while this function is being executed.
+ */
+static int device_resume_early(struct device *dev, pm_message_t state, bool async)
+{
+	pm_callback_t callback;
+	const char *info;
+	int error = 0;
+
+	TRACE_DEVICE(dev);
+	TRACE_RESUME(0);
+
+	if (dev->power.syscore || dev->power.direct_complete)
+		goto Out;
+
+	if (!dev->power.is_late_suspended)
+		goto Out;
+
+	dpm_wait_for_superior(dev, async);
+
+	callback = dpm_subsys_resume_early_cb(dev, state, &info);
+
 	if (!callback && dev->driver && dev->driver->pm) {
 		info = "early driver ";
 		callback = pm_late_early_op(dev->driver->pm, state);
@@ -1128,6 +1162,35 @@ static void dpm_superior_set_must_resume(struct device *dev)
 	device_links_read_unlock(idx);
 }
 
+static pm_callback_t dpm_subsys_suspend_noirq_cb(struct device *dev,
+						 pm_message_t state,
+						 const char **info_p)
+{
+	pm_callback_t callback;
+	const char *info;
+
+	if (dev->pm_domain) {
+		info = "noirq power domain ";
+		callback = pm_noirq_op(&dev->pm_domain->ops, state);
+	} else if (dev->type && dev->type->pm) {
+		info = "noirq type ";
+		callback = pm_noirq_op(dev->type->pm, state);
+	} else if (dev->class && dev->class->pm) {
+		info = "noirq class ";
+		callback = pm_noirq_op(dev->class->pm, state);
+	} else if (dev->bus && dev->bus->pm) {
+		info = "noirq bus ";
+		callback = pm_noirq_op(dev->bus->pm, state);
+	} else {
+		return NULL;
+	}
+
+	if (info_p)
+		*info_p = info;
+
+	return callback;
+}
+
 /**
  * __device_suspend_noirq - Execute a "noirq suspend" callback for given device.
  * @dev: Device to handle.
@@ -1139,8 +1202,8 @@ static void dpm_superior_set_must_resume(struct device *dev)
  */
 static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool async)
 {
-	pm_callback_t callback = NULL;
-	const char *info = NULL;
+	pm_callback_t callback;
+	const char *info;
 	int error = 0;
 
 	TRACE_DEVICE(dev);
@@ -1159,19 +1222,7 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a
 	if (dev->power.syscore || dev->power.direct_complete)
 		goto Complete;
 
-	if (dev->pm_domain) {
-		info = "noirq power domain ";
-		callback = pm_noirq_op(&dev->pm_domain->ops, state);
-	} else if (dev->type && dev->type->pm) {
-		info = "noirq type ";
-		callback = pm_noirq_op(dev->type->pm, state);
-	} else if (dev->class && dev->class->pm) {
-		info = "noirq class ";
-		callback = pm_noirq_op(dev->class->pm, state);
-	} else if (dev->bus && dev->bus->pm) {
-		info = "noirq bus ";
-		callback = pm_noirq_op(dev->bus->pm, state);
-	}
+	callback = dpm_subsys_suspend_noirq_cb(dev, state, &info);
 
 	if (!callback && dev->driver && dev->driver->pm) {
 		info = "noirq driver ";
@@ -1306,6 +1357,35 @@ int dpm_suspend_noirq(pm_message_t state)
 	return ret;
 }
 
+static pm_callback_t dpm_subsys_suspend_late_cb(struct device *dev,
+						pm_message_t state,
+						const char **info_p)
+{
+	pm_callback_t callback;
+	const char *info;
+
+	if (dev->pm_domain) {
+		info = "late power domain ";
+		callback = pm_late_early_op(&dev->pm_domain->ops, state);
+	} else if (dev->type && dev->type->pm) {
+		info = "late type ";
+		callback = pm_late_early_op(dev->type->pm, state);
+	} else if (dev->class && dev->class->pm) {
+		info = "late class ";
+		callback = pm_late_early_op(dev->class->pm, state);
+	} else if (dev->bus && dev->bus->pm) {
+		info = "late bus ";
+		callback = pm_late_early_op(dev->bus->pm, state);
+	} else {
+		return NULL;
+	}
+
+	if (info_p)
+		*info_p = info;
+
+	return callback;
+}
+
 /**
  * __device_suspend_late - Execute a "late suspend" callback for given device.
  * @dev: Device to handle.
@@ -1316,8 +1396,8 @@ int dpm_suspend_noirq(pm_message_t state)
  */
 static int __device_suspend_late(struct device *dev, pm_message_t state, bool async)
 {
-	pm_callback_t callback = NULL;
-	const char *info = NULL;
+	pm_callback_t callback;
+	const char *info;
 	int error = 0;
 
 	TRACE_DEVICE(dev);
@@ -1338,19 +1418,7 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as
 	if (dev->power.syscore || dev->power.direct_complete)
 		goto Complete;
 
-	if (dev->pm_domain) {
-		info = "late power domain ";
-		callback = pm_late_early_op(&dev->pm_domain->ops, state);
-	} else if (dev->type && dev->type->pm) {
-		info = "late type ";
-		callback = pm_late_early_op(dev->type->pm, state);
-	} else if (dev->class && dev->class->pm) {
-		info = "late class ";
-		callback = pm_late_early_op(dev->class->pm, state);
-	} else if (dev->bus && dev->bus->pm) {
-		info = "late bus ";
-		callback = pm_late_early_op(dev->bus->pm, state);
-	}
+	callback = dpm_subsys_suspend_late_cb(dev, state, &info);
 
 	if (!callback && dev->driver && dev->driver->pm) {
 		info = "late driver ";

From 75e94645fc3b1007eacb4c7863059f8e8d098cda Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 10 Dec 2017 01:00:45 +0100
Subject: [PATCH 48/89] PM / core: Direct DPM_FLAG_SMART_SUSPEND optimization

Make the PM core avoid invoking the "late" and "noirq" system-wide
suspend (or analogous) callbacks provided by device drivers directly
for devices with DPM_FLAG_SMART_SUSPEND set that are in runtime
suspend during the "late" and "noirq" phases of system-wide suspend
(or analogous) transitions.  That is only done for devices without
any middle-layer "late" and "noirq" suspend callbacks (to avoid
confusing the middle layer if there is one).

The underlying observation is that runtime PM is disabled for devices
during the "late" and "noirq" system-wide suspend phases, so if they
remain in runtime suspend from the "late" phase forward, it doesn't
make sense to invoke the "late" and "noirq" callbacks provided by
the drivers for them (arguably, the device is already suspended and
in the right state).  Thus, if the remaining driver suspend callbacks
are to be invoked directly by the core, they can be skipped.

This change really makes it possible for, say, platform device
drivers to re-use runtime PM suspend and resume callbacks by
pointing ->suspend_late and ->resume_early, respectively (and
possibly the analogous hibernation-related callback pointers too),
to them without adding any extra "is the device already suspended?"
type of checks to the callback routines, as long as they will be
invoked directly by the core.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/driver-api/pm/devices.rst | 18 +++---
 drivers/base/power/main.c               | 85 +++++++++++++++++++++++--
 2 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst
index b0fe63c91f8d..07026811dcae 100644
--- a/Documentation/driver-api/pm/devices.rst
+++ b/Documentation/driver-api/pm/devices.rst
@@ -777,14 +777,16 @@ The driver can indicate that by setting ``DPM_FLAG_SMART_SUSPEND`` in
 runtime suspend at the beginning of the ``suspend_late`` phase of system-wide
 suspend (or in the ``poweroff_late`` phase of hibernation), when runtime PM
 has been disabled for it, under the assumption that its state should not change
-after that point until the system-wide transition is over.  If that happens, the
-driver's system-wide resume callbacks, if present, may still be invoked during
-the subsequent system-wide resume transition and the device's runtime power
-management status may be set to "active" before enabling runtime PM for it,
-so the driver must be prepared to cope with the invocation of its system-wide
-resume callbacks back-to-back with its ``->runtime_suspend`` one (without the
-intervening ``->runtime_resume`` and so on) and the final state of the device
-must reflect the "active" status for runtime PM in that case.
+after that point until the system-wide transition is over (the PM core itself
+does that for devices whose "noirq", "late" and "early" system-wide PM callbacks
+are executed directly by it).  If that happens, the driver's system-wide resume
+callbacks, if present, may still be invoked during the subsequent system-wide
+resume transition and the device's runtime power management status may be set
+to "active" before enabling runtime PM for it, so the driver must be prepared to
+cope with the invocation of its system-wide resume callbacks back-to-back with
+its ``->runtime_suspend`` one (without the intervening ``->runtime_resume`` and
+so on) and the final state of the device must reflect the "active" runtime PM
+status in that case.
 
 During system-wide resume from a sleep state it's easiest to put devices into
 the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`.
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 3c5fdf155c91..154f7b4db8d0 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -539,6 +539,24 @@ void dev_pm_skip_next_resume_phases(struct device *dev)
 	dev->power.is_suspended = false;
 }
 
+/**
+ * suspend_event - Return a "suspend" message for given "resume" one.
+ * @resume_msg: PM message representing a system-wide resume transition.
+ */
+static pm_message_t suspend_event(pm_message_t resume_msg)
+{
+	switch (resume_msg.event) {
+	case PM_EVENT_RESUME:
+		return PMSG_SUSPEND;
+	case PM_EVENT_THAW:
+	case PM_EVENT_RESTORE:
+		return PMSG_FREEZE;
+	case PM_EVENT_RECOVER:
+		return PMSG_HIBERNATE;
+	}
+	return PMSG_ON;
+}
+
 /**
  * dev_pm_may_skip_resume - System-wide device resume optimization check.
  * @dev: Target device.
@@ -580,6 +598,14 @@ static pm_callback_t dpm_subsys_resume_noirq_cb(struct device *dev,
 	return callback;
 }
 
+static pm_callback_t dpm_subsys_suspend_noirq_cb(struct device *dev,
+						 pm_message_t state,
+						 const char **info_p);
+
+static pm_callback_t dpm_subsys_suspend_late_cb(struct device *dev,
+						pm_message_t state,
+						const char **info_p);
+
 /**
  * device_resume_noirq - Execute a "noirq resume" callback for given device.
  * @dev: Device to handle.
@@ -607,13 +633,40 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
 	dpm_wait_for_superior(dev, async);
 
 	callback = dpm_subsys_resume_noirq_cb(dev, state, &info);
+	if (callback)
+		goto Run;
 
-	if (!callback && dev->driver && dev->driver->pm) {
+	if (dev_pm_smart_suspend_and_suspended(dev)) {
+		pm_message_t suspend_msg = suspend_event(state);
+
+		/*
+		 * If "freeze" callbacks have been skipped during a transition
+		 * related to hibernation, the subsequent "thaw" callbacks must
+		 * be skipped too or bad things may happen.  Otherwise, resume
+		 * callbacks are going to be run for the device, so its runtime
+		 * PM status must be changed to reflect the new state after the
+		 * transition under way.
+		 */
+		if (!dpm_subsys_suspend_late_cb(dev, suspend_msg, NULL) &&
+		    !dpm_subsys_suspend_noirq_cb(dev, suspend_msg, NULL)) {
+			if (state.event == PM_EVENT_THAW) {
+				dev_pm_skip_next_resume_phases(dev);
+				goto Skip;
+			} else {
+				pm_runtime_set_active(dev);
+			}
+		}
+	}
+
+	if (dev->driver && dev->driver->pm) {
 		info = "noirq driver ";
 		callback = pm_noirq_op(dev->driver->pm, state);
 	}
 
+Run:
 	error = dpm_run_callback(callback, dev, state, info);
+
+Skip:
 	dev->power.is_noirq_suspended = false;
 
 	if (dev_pm_may_skip_resume(dev)) {
@@ -628,7 +681,7 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
 		dev_pm_skip_next_resume_phases(dev);
 	}
 
- Out:
+Out:
 	complete_all(&dev->power.completion);
 	TRACE_RESUME(error);
 	return error;
@@ -1223,18 +1276,26 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a
 		goto Complete;
 
 	callback = dpm_subsys_suspend_noirq_cb(dev, state, &info);
+	if (callback)
+		goto Run;
 
-	if (!callback && dev->driver && dev->driver->pm) {
+	if (dev_pm_smart_suspend_and_suspended(dev) &&
+	    !dpm_subsys_suspend_late_cb(dev, state, NULL))
+		goto Skip;
+
+	if (dev->driver && dev->driver->pm) {
 		info = "noirq driver ";
 		callback = pm_noirq_op(dev->driver->pm, state);
 	}
 
+Run:
 	error = dpm_run_callback(callback, dev, state, info);
 	if (error) {
 		async_error = error;
 		goto Complete;
 	}
 
+Skip:
 	dev->power.is_noirq_suspended = true;
 
 	if (dev_pm_test_driver_flags(dev, DPM_FLAG_LEAVE_SUSPENDED)) {
@@ -1419,17 +1480,27 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as
 		goto Complete;
 
 	callback = dpm_subsys_suspend_late_cb(dev, state, &info);
+	if (callback)
+		goto Run;
 
-	if (!callback && dev->driver && dev->driver->pm) {
+	if (dev_pm_smart_suspend_and_suspended(dev) &&
+	    !dpm_subsys_suspend_noirq_cb(dev, state, NULL))
+		goto Skip;
+
+	if (dev->driver && dev->driver->pm) {
 		info = "late driver ";
 		callback = pm_late_early_op(dev->driver->pm, state);
 	}
 
+Run:
 	error = dpm_run_callback(callback, dev, state, info);
-	if (!error)
-		dev->power.is_late_suspended = true;
-	else
+	if (error) {
 		async_error = error;
+		goto Complete;
+	}
+
+Skip:
+	dev->power.is_late_suspended = true;
 
 Complete:
 	TRACE_SUSPEND(error);

From 32bfa56ac158c1ebcc82df2518860f824be5e5be Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 10 Dec 2017 01:02:13 +0100
Subject: [PATCH 49/89] PM / core: Direct DPM_FLAG_LEAVE_SUSPENDED handling

Make the PM core handle DPM_FLAG_LEAVE_SUSPENDED directly for
devices whose "noirq", "late" and "early" driver callbacks are
invoked directly by it.

Namely, make it skip all of the system-wide resume callbacks for
such devices with DPM_FLAG_LEAVE_SUSPENDED set if they are in
runtime suspend during the "noirq" phase of system-wide suspend
(or analogous) transitions or the system transition under way is
a proper suspend (rather than anything related to hibernation) and
the device's wakeup settings are compatible with runtime PM (that
is, the device cannot generate wakeup signals at all or it is
allowed to wake up the system from sleep).

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/driver-api/pm/devices.rst |  9 +++++
 drivers/base/power/main.c               | 51 +++++++++++++++++++------
 2 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst
index 07026811dcae..1128705a5731 100644
--- a/Documentation/driver-api/pm/devices.rst
+++ b/Documentation/driver-api/pm/devices.rst
@@ -816,3 +816,12 @@ appropriate in its "noirq" resume callback, which is executed regardless of
 whether or not the device is left suspended, but the other resume callbacks
 (except for ``->complete``) will be skipped automatically by the PM core if the
 device really can be left in suspend.
+
+For devices whose "noirq", "late" and "early" driver callbacks are invoked
+directly by the PM core, all of the system-wide resume callbacks are skipped if
+``DPM_FLAG_LEAVE_SUSPENDED`` is set and the device is in runtime suspend during
+the ``suspend_noirq`` (or analogous) phase or the transition under way is a
+proper system suspend (rather than anything related to hibernation) and the
+device's wakeup settings are suitable for runtime PM (that is, it cannot
+generate wakeup signals at all or it is allowed to wake up the system from
+sleep).
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 154f7b4db8d0..70398e7b3569 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -619,6 +619,7 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
 {
 	pm_callback_t callback;
 	const char *info;
+	bool skip_resume;
 	int error = 0;
 
 	TRACE_DEVICE(dev);
@@ -632,10 +633,15 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
 
 	dpm_wait_for_superior(dev, async);
 
+	skip_resume = dev_pm_may_skip_resume(dev);
+
 	callback = dpm_subsys_resume_noirq_cb(dev, state, &info);
 	if (callback)
 		goto Run;
 
+	if (skip_resume)
+		goto Skip;
+
 	if (dev_pm_smart_suspend_and_suspended(dev)) {
 		pm_message_t suspend_msg = suspend_event(state);
 
@@ -650,7 +656,7 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
 		if (!dpm_subsys_suspend_late_cb(dev, suspend_msg, NULL) &&
 		    !dpm_subsys_suspend_noirq_cb(dev, suspend_msg, NULL)) {
 			if (state.event == PM_EVENT_THAW) {
-				dev_pm_skip_next_resume_phases(dev);
+				skip_resume = true;
 				goto Skip;
 			} else {
 				pm_runtime_set_active(dev);
@@ -669,7 +675,7 @@ Run:
 Skip:
 	dev->power.is_noirq_suspended = false;
 
-	if (dev_pm_may_skip_resume(dev)) {
+	if (skip_resume) {
 		/*
 		 * The device is going to be left in suspend, but it might not
 		 * have been in runtime suspend before the system suspended, so
@@ -1244,6 +1250,32 @@ static pm_callback_t dpm_subsys_suspend_noirq_cb(struct device *dev,
 	return callback;
 }
 
+static bool device_must_resume(struct device *dev, pm_message_t state,
+			       bool no_subsys_suspend_noirq)
+{
+	pm_message_t resume_msg = resume_event(state);
+
+	/*
+	 * If all of the device driver's "noirq", "late" and "early" callbacks
+	 * are invoked directly by the core, the decision to allow the device to
+	 * stay in suspend can be based on its current runtime PM status and its
+	 * wakeup settings.
+	 */
+	if (no_subsys_suspend_noirq &&
+	    !dpm_subsys_suspend_late_cb(dev, state, NULL) &&
+	    !dpm_subsys_resume_early_cb(dev, resume_msg, NULL) &&
+	    !dpm_subsys_resume_noirq_cb(dev, resume_msg, NULL))
+		return !pm_runtime_status_suspended(dev) &&
+			(resume_msg.event != PM_EVENT_RESUME ||
+			 (device_can_wakeup(dev) && !device_may_wakeup(dev)));
+
+	/*
+	 * The only safe strategy here is to require that if the device may not
+	 * be left in suspend, resume callbacks must be invoked for it.
+	 */
+	return !dev->power.may_skip_resume;
+}
+
 /**
  * __device_suspend_noirq - Execute a "noirq suspend" callback for given device.
  * @dev: Device to handle.
@@ -1257,6 +1289,7 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a
 {
 	pm_callback_t callback;
 	const char *info;
+	bool no_subsys_cb = false;
 	int error = 0;
 
 	TRACE_DEVICE(dev);
@@ -1279,8 +1312,9 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a
 	if (callback)
 		goto Run;
 
-	if (dev_pm_smart_suspend_and_suspended(dev) &&
-	    !dpm_subsys_suspend_late_cb(dev, state, NULL))
+	no_subsys_cb = !dpm_subsys_suspend_late_cb(dev, state, NULL);
+
+	if (dev_pm_smart_suspend_and_suspended(dev) && no_subsys_cb)
 		goto Skip;
 
 	if (dev->driver && dev->driver->pm) {
@@ -1299,14 +1333,9 @@ Skip:
 	dev->power.is_noirq_suspended = true;
 
 	if (dev_pm_test_driver_flags(dev, DPM_FLAG_LEAVE_SUSPENDED)) {
-		/*
-		 * The only safe strategy here is to require that if the device
-		 * may not be left in suspend, resume callbacks must be invoked
-		 * for it.
-		 */
 		dev->power.must_resume = dev->power.must_resume ||
-					!dev->power.may_skip_resume ||
-					atomic_read(&dev->power.usage_count) > 1;
+				atomic_read(&dev->power.usage_count) > 1 ||
+				device_must_resume(dev, state, no_subsys_cb);
 	} else {
 		dev->power.must_resume = true;
 	}

From ac89c400ebb146604e718b3fa168c15592e73a8c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 2 Jan 2018 10:51:34 +0530
Subject: [PATCH 50/89] cpu_cooling: Remove static-power related documentation

commit 84fe2cab4859 ("cpu_cooling: Drop static-power related stuff")
removed support for static-power in kernel, but it missed reflecting the
same in documentation. Remove the static power related documentation
bits as well.

Reported-by: Javi Merino <javi.merino@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/thermal/cpu-cooling-api.txt | 82 +----------------------
 1 file changed, 2 insertions(+), 80 deletions(-)

diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
index 7a1c89db0419..7df567eaea1a 100644
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ b/Documentation/thermal/cpu-cooling-api.txt
@@ -44,16 +44,14 @@ the user. The registration APIs returns the cooling device pointer.
 2. Power models
 
 The power API registration functions provide a simple power model for
-CPUs.  The current power is calculated as dynamic + (optionally)
-static power.  This power model requires that the operating-points of
+CPUs.  The current power is calculated as dynamic power (static power isn't
+supported currently).  This power model requires that the operating-points of
 the CPUs are registered using the kernel's opp library and the
 `cpufreq_frequency_table` is assigned to the `struct device` of the
 cpu.  If you are using CONFIG_CPUFREQ_DT then the
 `cpufreq_frequency_table` should already be assigned to the cpu
 device.
 
-2.1 Dynamic power
-
 The dynamic power consumption of a processor depends on many factors.
 For a given processor implementation the primary factors are:
 
@@ -92,79 +90,3 @@ mW/MHz/uVolt^2.  Typical values for mobile CPUs might lie in range
 from 100 to 500.  For reference, the approximate values for the SoC in
 ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
 140 for the Cortex-A53 cluster.
-
-
-2.2 Static power
-
-Static leakage power consumption depends on a number of factors.  For a
-given circuit implementation the primary factors are:
-
-- Time the circuit spends in each 'power state'
-- Temperature
-- Operating voltage
-- Process grade
-
-The time the circuit spends in each 'power state' for a given
-evaluation period at first order means OFF or ON.  However,
-'retention' states can also be supported that reduce power during
-inactive periods without loss of context.
-
-Note: The visibility of state entries to the OS can vary, according to
-platform specifics, and this can then impact the accuracy of a model
-based on OS state information alone.  It might be possible in some
-cases to extract more accurate information from system resources.
-
-The temperature, operating voltage and process 'grade' (slow to fast)
-of the circuit are all significant factors in static leakage power
-consumption.  All of these have complex relationships to static power.
-
-Circuit implementation specific factors include the chosen silicon
-process as well as the type, number and size of transistors in both
-the logic gates and any RAM elements included.
-
-The static power consumption modelling must take into account the
-power managed regions that are implemented.  Taking the example of an
-ARM processor cluster, the modelling would take into account whether
-each CPU can be powered OFF separately or if only a single power
-region is implemented for the complete cluster.
-
-In one view, there are others, a static power consumption model can
-then start from a set of reference values for each power managed
-region (e.g. CPU, Cluster/L2) in each state (e.g. ON, OFF) at an
-arbitrary process grade, voltage and temperature point.  These values
-are then scaled for all of the following: the time in each state, the
-process grade, the current temperature and the operating voltage.
-However, since both implementation specific and complex relationships
-dominate the estimate, the appropriate interface to the model from the
-cpu cooling device is to provide a function callback that calculates
-the static power in this platform.  When registering the cpu cooling
-device pass a function pointer that follows the `get_static_t`
-prototype:
-
-    int plat_get_static(cpumask_t *cpumask, int interval,
-                        unsigned long voltage, u32 &power);
-
-`cpumask` is the cpumask of the cpus involved in the calculation.
-`voltage` is the voltage at which they are operating.  The function
-should calculate the average static power for the last `interval`
-milliseconds.  It returns 0 on success, -E* on error.  If it
-succeeds, it should store the static power in `power`.  Reading the
-temperature of the cpus described by `cpumask` is left for
-plat_get_static() to do as the platform knows best which thermal
-sensor is closest to the cpu.
-
-If `plat_static_func` is NULL, static power is considered to be
-negligible for this platform and only dynamic power is considered.
-
-The platform specific callback can then use any combination of tables
-and/or equations to permute the estimated value.  Process grade
-information is not passed to the model since access to such data, from
-on-chip measurement capability or manufacture time data, is platform
-specific.
-
-Note: the significance of static power for CPUs in comparison to
-dynamic power is highly dependent on implementation.  Given the
-potential complexity in implementation, the importance and accuracy of
-its inclusion when using cpu cooling devices should be assessed on a
-case by case basis.
-

From ee1f4a7dafa997816ff3de96155c6f3edc21c1e6 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 13 Dec 2017 12:27:39 +0530
Subject: [PATCH 51/89] powernv-cpufreq: Add helper to extract pstate from PMSR

On POWERNV platform, the fields for pstates in the Power Management
Status Register (PMSR) and the Power Management Control Register
(PMCR) are 8-bits wide. On POWER8 the pstates are negatively numbered
while on POWER9 they are positively numbered.

The device-tree exports pstates as 32-bit entries. The device-tree
implementation sign-extends the 8-bit pstate values to obtain the
corresponding 32-bit entry.

Eg: On POWER8, a pstate value 0x82 [-126] is represented in the
device-tree as 0xfffffff82 while on POWER9, the same value 0x82 [130]
is represented in the device-tree as 0x00000082.

The powernv-cpufreq driver implementation represents pstates using the
integer type. In multiple places in the driver, the code interprets
the pstates extracted from the PMSR as a signed byte and assigns it to
a integer variable to get the sign-extention.

On POWER9 platforms which have greater than 128 pstates, this results
in the driver performing incorrect sign-extention, and thereby
treating a legitimate pstate (say 130) as an invalid pstates (since it
is interpreted as -126).

This patch fixes the issue by implementing a helper function to
extract Pstates from PMSR register, and correctly sign-extend it to be
consistent with the values provided by the device-tree.

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 37 +++++++++++++++++++------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index b6d7c4c98d0a..f46b60fb3084 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -41,11 +41,9 @@
 #define POWERNV_MAX_PSTATES	256
 #define PMSR_PSAFE_ENABLE	(1UL << 30)
 #define PMSR_SPR_EM_DISABLE	(1UL << 31)
-#define PMSR_MAX(x)		((x >> 32) & 0xFF)
+#define MAX_PSTATE_SHIFT	32
 #define LPSTATE_SHIFT		48
 #define GPSTATE_SHIFT		56
-#define GET_LPSTATE(x)		(((x) >> LPSTATE_SHIFT) & 0xFF)
-#define GET_GPSTATE(x)		(((x) >> GPSTATE_SHIFT) & 0xFF)
 
 #define MAX_RAMP_DOWN_TIME				5120
 /*
@@ -94,6 +92,7 @@ struct global_pstate_info {
 };
 
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
+u32 pstate_sign_prefix;
 static bool rebooting, throttled, occ_reset;
 
 static const char * const throttle_reason[] = {
@@ -148,6 +147,20 @@ static struct powernv_pstate_info {
 	bool wof_enabled;
 } powernv_pstate_info;
 
+static inline int extract_pstate(u64 pmsr_val, unsigned int shift)
+{
+	int ret = ((pmsr_val >> shift) & 0xFF);
+
+	if (!ret)
+		return ret;
+
+	return (pstate_sign_prefix | ret);
+}
+
+#define extract_local_pstate(x) extract_pstate(x, LPSTATE_SHIFT)
+#define extract_global_pstate(x) extract_pstate(x, GPSTATE_SHIFT)
+#define extract_max_pstate(x)  extract_pstate(x, MAX_PSTATE_SHIFT)
+
 /* Use following macros for conversions between pstate_id and index */
 static inline int idx_to_pstate(unsigned int i)
 {
@@ -278,6 +291,9 @@ next:
 
 	powernv_pstate_info.nr_pstates = nr_pstates;
 	pr_debug("NR PStates %d\n", nr_pstates);
+
+	pstate_sign_prefix = pstate_min & ~0xFF;
+
 	for (i = 0; i < nr_pstates; i++) {
 		u32 id = be32_to_cpu(pstate_ids[i]);
 		u32 freq = be32_to_cpu(pstate_freqs[i]);
@@ -438,17 +454,10 @@ struct powernv_smp_call_data {
 static void powernv_read_cpu_freq(void *arg)
 {
 	unsigned long pmspr_val;
-	s8 local_pstate_id;
 	struct powernv_smp_call_data *freq_data = arg;
 
 	pmspr_val = get_pmspr(SPRN_PMSR);
-
-	/*
-	 * The local pstate id corresponds bits 48..55 in the PMSR.
-	 * Note: Watch out for the sign!
-	 */
-	local_pstate_id = (pmspr_val >> 48) & 0xFF;
-	freq_data->pstate_id = local_pstate_id;
+	freq_data->pstate_id = extract_local_pstate(pmspr_val);
 	freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);
 
 	pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n",
@@ -522,7 +531,7 @@ static void powernv_cpufreq_throttle_check(void *data)
 	chip = this_cpu_read(chip_info);
 
 	/* Check for Pmax Capping */
-	pmsr_pmax = (s8)PMSR_MAX(pmsr);
+	pmsr_pmax = extract_max_pstate(pmsr);
 	pmsr_pmax_idx = pstate_to_idx(pmsr_pmax);
 	if (pmsr_pmax_idx != powernv_pstate_info.max) {
 		if (chip->throttled)
@@ -645,8 +654,8 @@ void gpstate_timer_handler(struct timer_list *t)
 	 * value. Hence, read from PMCR to get correct data.
 	 */
 	val = get_pmspr(SPRN_PMCR);
-	freq_data.gpstate_id = (s8)GET_GPSTATE(val);
-	freq_data.pstate_id = (s8)GET_LPSTATE(val);
+	freq_data.gpstate_id = extract_global_pstate(val);
+	freq_data.pstate_id = extract_local_pstate(val);
 	if (freq_data.gpstate_id  == freq_data.pstate_id) {
 		reset_gpstates(policy);
 		spin_unlock(&gpstates->gpstate_lock);

From 332f0a01f0dd669dcd208e4e9666d80dffd62e7b Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 13 Dec 2017 12:27:40 +0530
Subject: [PATCH 52/89] powernv-cpufreq: Fix pstate_to_idx() to handle
 non-continguous pstates

The code in powernv-cpufreq, makes the following two assumptions which
are not guaranteed by the device-tree bindings:

    1) Pstate ids are continguous: This is used in pstate_to_idx() to
       obtain the reverse map from a pstate to it's corresponding
       entry into the cpufreq frequency table.

    2) Every Pstate should always lie between the max and the min
       pstates that are explicitly reported in the device tree: This
       is used to determine whether a pstate reported by the PMSR is
       out of bounds.

Both these assumptions are unwarranted and can change on future
platforms.

In this patch, we maintain the reverse map from a pstate to it's index
in the cpufreq frequency table and use this in pstate_to_idx(). This
does away with the assumptions (1) mentioned above, and will work with
non continguous pstate ids. If no entry exists for a particular
pstate, then such a pstate is treated as being out of bounds. This
gets rid of assumption (2).

On all the existing platforms, where the pstates are 8-bit long
values, the new implementation of pstate_to_idx() takes constant time.

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 85 +++++++++++++++++++++++--------
 1 file changed, 63 insertions(+), 22 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index f46b60fb3084..8e3dbcaee286 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -29,6 +29,7 @@
 #include <linux/reboot.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/hashtable.h>
 #include <trace/events/power.h>
 
 #include <asm/cputhreads.h>
@@ -38,7 +39,8 @@
 #include <asm/opal.h>
 #include <linux/timer.h>
 
-#define POWERNV_MAX_PSTATES	256
+#define POWERNV_MAX_PSTATES_ORDER  8
+#define POWERNV_MAX_PSTATES	(1UL << (POWERNV_MAX_PSTATES_ORDER))
 #define PMSR_PSAFE_ENABLE	(1UL << 30)
 #define PMSR_SPR_EM_DISABLE	(1UL << 31)
 #define MAX_PSTATE_SHIFT	32
@@ -92,6 +94,27 @@ struct global_pstate_info {
 };
 
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
+
+DEFINE_HASHTABLE(pstate_revmap, POWERNV_MAX_PSTATES_ORDER);
+/**
+ * struct pstate_idx_revmap_data: Entry in the hashmap pstate_revmap
+ *				  indexed by a function of pstate id.
+ *
+ * @pstate_id: pstate id for this entry.
+ *
+ * @cpufreq_table_idx: Index into the powernv_freqs
+ *		       cpufreq_frequency_table for frequency
+ *		       corresponding to pstate_id.
+ *
+ * @hentry: hlist_node that hooks this entry into the pstate_revmap
+ *	    hashtable
+ */
+struct pstate_idx_revmap_data {
+	int pstate_id;
+	unsigned int cpufreq_table_idx;
+	struct hlist_node hentry;
+};
+
 u32 pstate_sign_prefix;
 static bool rebooting, throttled, occ_reset;
 
@@ -161,39 +184,47 @@ static inline int extract_pstate(u64 pmsr_val, unsigned int shift)
 #define extract_global_pstate(x) extract_pstate(x, GPSTATE_SHIFT)
 #define extract_max_pstate(x)  extract_pstate(x, MAX_PSTATE_SHIFT)
 
-/* Use following macros for conversions between pstate_id and index */
+/* Use following functions for conversions between pstate_id and index */
+
+/**
+ * idx_to_pstate : Returns the pstate id corresponding to the
+ *		   frequency in the cpufreq frequency table
+ *		   powernv_freqs indexed by @i.
+ *
+ *		   If @i is out of bound, this will return the pstate
+ *		   corresponding to the nominal frequency.
+ */
 static inline int idx_to_pstate(unsigned int i)
 {
 	if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
-		pr_warn_once("index %u is out of bound\n", i);
+		pr_warn_once("idx_to_pstate: index %u is out of bound\n", i);
 		return powernv_freqs[powernv_pstate_info.nominal].driver_data;
 	}
 
 	return powernv_freqs[i].driver_data;
 }
 
-static inline unsigned int pstate_to_idx(int pstate)
+/**
+ * pstate_to_idx : Returns the index in the cpufreq frequencytable
+ *		   powernv_freqs for the frequency whose corresponding
+ *		   pstate id is @pstate.
+ *
+ *		   If no frequency corresponding to @pstate is found,
+ *		   this will return the index of the nominal
+ *		   frequency.
+ */
+static unsigned int pstate_to_idx(int pstate)
 {
-	int min = powernv_freqs[powernv_pstate_info.min].driver_data;
-	int max = powernv_freqs[powernv_pstate_info.max].driver_data;
+	unsigned int key = pstate % POWERNV_MAX_PSTATES;
+	struct pstate_idx_revmap_data *revmap_data;
 
-	if (min > 0) {
-		if (unlikely((pstate < max) || (pstate > min))) {
-			pr_warn_once("pstate %d is out of bound\n", pstate);
-			return powernv_pstate_info.nominal;
-		}
-	} else {
-		if (unlikely((pstate > max) || (pstate < min))) {
-			pr_warn_once("pstate %d is out of bound\n", pstate);
-			return powernv_pstate_info.nominal;
-		}
+	hash_for_each_possible(pstate_revmap, revmap_data, hentry, key) {
+		if (revmap_data->pstate_id == pstate)
+			return revmap_data->cpufreq_table_idx;
 	}
-	/*
-	 * abs() is deliberately used so that is works with
-	 * both monotonically increasing and decreasing
-	 * pstate values
-	 */
-	return abs(pstate - idx_to_pstate(powernv_pstate_info.max));
+
+	pr_warn_once("pstate_to_idx: pstate %d not found\n", pstate);
+	return powernv_pstate_info.nominal;
 }
 
 static inline void reset_gpstates(struct cpufreq_policy *policy)
@@ -297,11 +328,21 @@ next:
 	for (i = 0; i < nr_pstates; i++) {
 		u32 id = be32_to_cpu(pstate_ids[i]);
 		u32 freq = be32_to_cpu(pstate_freqs[i]);
+		struct pstate_idx_revmap_data *revmap_data;
+		unsigned int key;
 
 		pr_debug("PState id %d freq %d MHz\n", id, freq);
 		powernv_freqs[i].frequency = freq * 1000; /* kHz */
 		powernv_freqs[i].driver_data = id;
 
+		revmap_data = (struct pstate_idx_revmap_data *)
+			      kmalloc(sizeof(*revmap_data), GFP_KERNEL);
+
+		revmap_data->pstate_id = id;
+		revmap_data->cpufreq_table_idx = i;
+		key = id % POWERNV_MAX_PSTATES;
+		hash_add(pstate_revmap, &revmap_data->hentry, key);
+
 		if (id == pstate_max)
 			powernv_pstate_info.max = i;
 		else if (id == pstate_nominal)

From 967b87fd81d513de7aac247320f02b1645f9ca64 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 13 Dec 2017 12:27:41 +0530
Subject: [PATCH 53/89] powernv-cpufreq: Treat pstates as opaque 8-bit values

On POWER8 and POWER9, the PMSR and the PMCR registers define pstates
to be 8-bit wide values. The device-tree exports pstates as 32-bit
wide values of which the lower byte is the actual pstate.

The current implementation in the kernel treats pstates as integer
type, since it used to use the sign of the pstate for performing some
boundary-checks. This is no longer required after the patch
"powernv-cpufreq: Fix pstate_to_idx() to handle non-continguous
pstates".

So, in this patch, we modify the powernv-cpufreq driver to uniformly
treat pstates as opaque 8-bit values obtained from the device-tree or
the PMCR. This simplifies the extract_pstate() helper function since
we no longer no longer require to worry about the sign-extentions.

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 47 +++++++++++++------------------
 1 file changed, 19 insertions(+), 28 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 8e3dbcaee286..8a4e2ce0804c 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -110,12 +110,11 @@ DEFINE_HASHTABLE(pstate_revmap, POWERNV_MAX_PSTATES_ORDER);
  *	    hashtable
  */
 struct pstate_idx_revmap_data {
-	int pstate_id;
+	u8 pstate_id;
 	unsigned int cpufreq_table_idx;
 	struct hlist_node hentry;
 };
 
-u32 pstate_sign_prefix;
 static bool rebooting, throttled, occ_reset;
 
 static const char * const throttle_reason[] = {
@@ -170,14 +169,9 @@ static struct powernv_pstate_info {
 	bool wof_enabled;
 } powernv_pstate_info;
 
-static inline int extract_pstate(u64 pmsr_val, unsigned int shift)
+static inline u8 extract_pstate(u64 pmsr_val, unsigned int shift)
 {
-	int ret = ((pmsr_val >> shift) & 0xFF);
-
-	if (!ret)
-		return ret;
-
-	return (pstate_sign_prefix | ret);
+	return ((pmsr_val >> shift) & 0xFF);
 }
 
 #define extract_local_pstate(x) extract_pstate(x, LPSTATE_SHIFT)
@@ -194,7 +188,7 @@ static inline int extract_pstate(u64 pmsr_val, unsigned int shift)
  *		   If @i is out of bound, this will return the pstate
  *		   corresponding to the nominal frequency.
  */
-static inline int idx_to_pstate(unsigned int i)
+static inline u8 idx_to_pstate(unsigned int i)
 {
 	if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
 		pr_warn_once("idx_to_pstate: index %u is out of bound\n", i);
@@ -213,7 +207,7 @@ static inline int idx_to_pstate(unsigned int i)
  *		   this will return the index of the nominal
  *		   frequency.
  */
-static unsigned int pstate_to_idx(int pstate)
+static unsigned int pstate_to_idx(u8 pstate)
 {
 	unsigned int key = pstate % POWERNV_MAX_PSTATES;
 	struct pstate_idx_revmap_data *revmap_data;
@@ -223,7 +217,7 @@ static unsigned int pstate_to_idx(int pstate)
 			return revmap_data->cpufreq_table_idx;
 	}
 
-	pr_warn_once("pstate_to_idx: pstate %d not found\n", pstate);
+	pr_warn_once("pstate_to_idx: pstate 0x%x not found\n", pstate);
 	return powernv_pstate_info.nominal;
 }
 
@@ -291,7 +285,7 @@ static int init_powernv_pstates(void)
 		powernv_pstate_info.wof_enabled = true;
 
 next:
-	pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
+	pr_info("cpufreq pstate min 0x%x nominal 0x%x max 0x%x\n", pstate_min,
 		pstate_nominal, pstate_max);
 	pr_info("Workload Optimized Frequency is %s in the platform\n",
 		(powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
@@ -323,8 +317,6 @@ next:
 	powernv_pstate_info.nr_pstates = nr_pstates;
 	pr_debug("NR PStates %d\n", nr_pstates);
 
-	pstate_sign_prefix = pstate_min & ~0xFF;
-
 	for (i = 0; i < nr_pstates; i++) {
 		u32 id = be32_to_cpu(pstate_ids[i]);
 		u32 freq = be32_to_cpu(pstate_freqs[i]);
@@ -333,14 +325,14 @@ next:
 
 		pr_debug("PState id %d freq %d MHz\n", id, freq);
 		powernv_freqs[i].frequency = freq * 1000; /* kHz */
-		powernv_freqs[i].driver_data = id;
+		powernv_freqs[i].driver_data = id & 0xFF;
 
 		revmap_data = (struct pstate_idx_revmap_data *)
 			      kmalloc(sizeof(*revmap_data), GFP_KERNEL);
 
-		revmap_data->pstate_id = id;
+		revmap_data->pstate_id = id & 0xFF;
 		revmap_data->cpufreq_table_idx = i;
-		key = id % POWERNV_MAX_PSTATES;
+		key = (revmap_data->pstate_id) % POWERNV_MAX_PSTATES;
 		hash_add(pstate_revmap, &revmap_data->hentry, key);
 
 		if (id == pstate_max)
@@ -364,14 +356,13 @@ next:
 }
 
 /* Returns the CPU frequency corresponding to the pstate_id. */
-static unsigned int pstate_id_to_freq(int pstate_id)
+static unsigned int pstate_id_to_freq(u8 pstate_id)
 {
 	int i;
 
 	i = pstate_to_idx(pstate_id);
 	if (i >= powernv_pstate_info.nr_pstates || i < 0) {
-		pr_warn("PState id %d outside of PState table, "
-			"reporting nominal id %d instead\n",
+		pr_warn("PState id 0x%x outside of PState table, reporting nominal id 0x%x instead\n",
 			pstate_id, idx_to_pstate(powernv_pstate_info.nominal));
 		i = powernv_pstate_info.nominal;
 	}
@@ -477,8 +468,8 @@ static inline void set_pmspr(unsigned long sprn, unsigned long val)
  */
 struct powernv_smp_call_data {
 	unsigned int freq;
-	int pstate_id;
-	int gpstate_id;
+	u8 pstate_id;
+	u8 gpstate_id;
 };
 
 /*
@@ -501,9 +492,9 @@ static void powernv_read_cpu_freq(void *arg)
 	freq_data->pstate_id = extract_local_pstate(pmspr_val);
 	freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);
 
-	pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n",
-		raw_smp_processor_id(), pmspr_val, freq_data->pstate_id,
-		freq_data->freq);
+	pr_debug("cpu %d pmsr %016lX pstate_id 0x%x frequency %d kHz\n",
+		 raw_smp_processor_id(), pmspr_val, freq_data->pstate_id,
+		 freq_data->freq);
 }
 
 /*
@@ -565,7 +556,7 @@ static void powernv_cpufreq_throttle_check(void *data)
 	struct chip *chip;
 	unsigned int cpu = smp_processor_id();
 	unsigned long pmsr;
-	int pmsr_pmax;
+	u8 pmsr_pmax;
 	unsigned int pmsr_pmax_idx;
 
 	pmsr = get_pmspr(SPRN_PMSR);
@@ -579,7 +570,7 @@ static void powernv_cpufreq_throttle_check(void *data)
 			goto next;
 		chip->throttled = true;
 		if (pmsr_pmax_idx > powernv_pstate_info.nominal) {
-			pr_warn_once("CPU %d on Chip %u has Pmax(%d) reduced below nominal frequency(%d)\n",
+			pr_warn_once("CPU %d on Chip %u has Pmax(0x%x) reduced below that of nominal frequency(0x%x)\n",
 				     cpu, chip->id, pmsr_pmax,
 				     idx_to_pstate(powernv_pstate_info.nominal));
 			chip->throttle_sub_turbo++;

From d476ec4f7f5aba47b0a570cbf659d7330d7e71cf Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 4 Jan 2018 08:53:54 +0530
Subject: [PATCH 54/89] cpufreq: stats: Change return type of
 cpufreq_stats_update() as void

It always returns 0 and none of its callers check its return value. Make
it return void.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_stats.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 1e55b5790853..1572129844a5 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -27,7 +27,7 @@ struct cpufreq_stats {
 	unsigned int *trans_table;
 };
 
-static int cpufreq_stats_update(struct cpufreq_stats *stats)
+static void cpufreq_stats_update(struct cpufreq_stats *stats)
 {
 	unsigned long long cur_time = get_jiffies_64();
 
@@ -35,7 +35,6 @@ static int cpufreq_stats_update(struct cpufreq_stats *stats)
 	stats->time_in_state[stats->last_index] += cur_time - stats->last_time;
 	stats->last_time = cur_time;
 	spin_unlock(&cpufreq_stats_lock);
-	return 0;
 }
 
 static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)

From 5a2772a82034722b4d4c7a2d4bfd07939ee46926 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Tue, 2 Jan 2018 14:27:57 +0100
Subject: [PATCH 55/89] PM / AVS: rockchip-io: account for const type of
 of_device_id.data

This driver creates a number of const structures that it stores in the
data field of an of_device_id array.

The data field of an of_device_id structure has type const void *, so
there is no need for a const-discarding cast when putting const values
into such a structure.

Furthermore, adding const to the declaration of the location that
receives a const value from such a field ensures that the compiler
will continue to check that the value is not modified.  The
const-discarding cast on the extraction from the data field is
thus no longer needed.

Done using Coccinelle.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/power/avs/rockchip-io-domain.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/power/avs/rockchip-io-domain.c b/drivers/power/avs/rockchip-io-domain.c
index 75f63e38a8d1..ed2b109ae8fc 100644
--- a/drivers/power/avs/rockchip-io-domain.c
+++ b/drivers/power/avs/rockchip-io-domain.c
@@ -76,7 +76,7 @@ struct rockchip_iodomain_supply {
 struct rockchip_iodomain {
 	struct device *dev;
 	struct regmap *grf;
-	struct rockchip_iodomain_soc_data *soc_data;
+	const struct rockchip_iodomain_soc_data *soc_data;
 	struct rockchip_iodomain_supply supplies[MAX_SUPPLIES];
 };
 
@@ -382,43 +382,43 @@ static const struct rockchip_iodomain_soc_data soc_data_rv1108_pmu = {
 static const struct of_device_id rockchip_iodomain_match[] = {
 	{
 		.compatible = "rockchip,rk3188-io-voltage-domain",
-		.data = (void *)&soc_data_rk3188
+		.data = &soc_data_rk3188
 	},
 	{
 		.compatible = "rockchip,rk3228-io-voltage-domain",
-		.data = (void *)&soc_data_rk3228
+		.data = &soc_data_rk3228
 	},
 	{
 		.compatible = "rockchip,rk3288-io-voltage-domain",
-		.data = (void *)&soc_data_rk3288
+		.data = &soc_data_rk3288
 	},
 	{
 		.compatible = "rockchip,rk3328-io-voltage-domain",
-		.data = (void *)&soc_data_rk3328
+		.data = &soc_data_rk3328
 	},
 	{
 		.compatible = "rockchip,rk3368-io-voltage-domain",
-		.data = (void *)&soc_data_rk3368
+		.data = &soc_data_rk3368
 	},
 	{
 		.compatible = "rockchip,rk3368-pmu-io-voltage-domain",
-		.data = (void *)&soc_data_rk3368_pmu
+		.data = &soc_data_rk3368_pmu
 	},
 	{
 		.compatible = "rockchip,rk3399-io-voltage-domain",
-		.data = (void *)&soc_data_rk3399
+		.data = &soc_data_rk3399
 	},
 	{
 		.compatible = "rockchip,rk3399-pmu-io-voltage-domain",
-		.data = (void *)&soc_data_rk3399_pmu
+		.data = &soc_data_rk3399_pmu
 	},
 	{
 		.compatible = "rockchip,rv1108-io-voltage-domain",
-		.data = (void *)&soc_data_rv1108
+		.data = &soc_data_rv1108
 	},
 	{
 		.compatible = "rockchip,rv1108-pmu-io-voltage-domain",
-		.data = (void *)&soc_data_rv1108_pmu
+		.data = &soc_data_rv1108_pmu
 	},
 	{ /* sentinel */ },
 };
@@ -443,7 +443,7 @@ static int rockchip_iodomain_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, iod);
 
 	match = of_match_node(rockchip_iodomain_match, np);
-	iod->soc_data = (struct rockchip_iodomain_soc_data *)match->data;
+	iod->soc_data = match->data;
 
 	parent = pdev->dev.parent;
 	if (parent && parent->of_node) {

From 3cfd68b5ba8737d28bfcf9b6487ea4d9216b8504 Mon Sep 17 00:00:00 2001
From: gaurav jindal <gauravjindal1104@gmail.com>
Date: Fri, 5 Jan 2018 14:01:30 +0100
Subject: [PATCH 56/89] cpuidle: Avoid NULL argument in
 cpuidle_switch_governor()

Checks if the new governor is NULL before updating the
cupidle_curr_governor.

Signed-off-by: gaurav jindal <gauravjindal1104@gmail.com>
[ rjw : Subject ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/governor.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index 4e78263e34a4..5d359aff3cc5 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -36,14 +36,15 @@ static struct cpuidle_governor * __cpuidle_find_governor(const char *str)
 /**
  * cpuidle_switch_governor - changes the governor
  * @gov: the new target governor
- *
- * NOTE: "gov" can be NULL to specify disabled
  * Must be called with cpuidle_lock acquired.
  */
 int cpuidle_switch_governor(struct cpuidle_governor *gov)
 {
 	struct cpuidle_device *dev;
 
+	if (!gov)
+		return -EINVAL;
+
 	if (gov == cpuidle_curr_governor)
 		return 0;
 

From bdbc98abb3aa323f6323b11db39c740e6f8fc5b1 Mon Sep 17 00:00:00 2001
From: Rainer Fiebig <jrf@mailbox.org>
Date: Fri, 22 Dec 2017 11:13:59 +0100
Subject: [PATCH 57/89] PM: hibernate: Do not subtract NR_FILE_MAPPED in
 minimum_image_size()

s2disk/s2both may fail unnecessarily and erratically if NR_FILE_MAPPED
is high - for instance when using VMs with VirtualBox and perhaps VMware
Player. In those situations s2disk becomes unreliable and therefore
unusable.

A typical scenario is: user issues a s2disk and it fails. User issues
a second s2disk immediately after that and it succeeds.  And user
wonders why.

The problem is caused by minimum_image_size() in snapshot.c.  The
value it returns is roughly 100% too high because NR_FILE_MAPPED is
subtracted in its calculation.  Eventually the number of preallocated
image pages is falsely too low.

This doesn't matter as long as NR_FILE_MAPPED-values are in a normal
range or in 32bit-environments as the code allows for allocation of
additional pages from highmem.

But with the high values generated by VirtualBox-VMs (a 2-GB-VM causes
NR_FILE_MAPPED go up by 2 GB) it may lead to failure in 64bit-systems.

Not subtracting NR_FILE_MAPPED in minimum_image_size() solves the
problem.

I've done at least hundreds of successful s2both/s2disk now on an
x86_64 system (with and without VirtualBox) which gives me some
confidence that this is right.  It has turned s2disk/s2both from
unusable into 100% reliable.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=97201
Signed-off-by: Rainer Fiebig <jrf@mailbox.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bce0464524d8..3d37c279c090 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1645,8 +1645,7 @@ static unsigned long free_unnecessary_pages(void)
  * [number of saveable pages] - [number of pages that can be freed in theory]
  *
  * where the second term is the sum of (1) reclaimable slab pages, (2) active
- * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
- * minus mapped file pages.
+ * and (3) inactive anonymous pages, (4) active and (5) inactive file pages.
  */
 static unsigned long minimum_image_size(unsigned long saveable)
 {
@@ -1656,8 +1655,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
 		+ global_node_page_state(NR_ACTIVE_ANON)
 		+ global_node_page_state(NR_INACTIVE_ANON)
 		+ global_node_page_state(NR_ACTIVE_FILE)
-		+ global_node_page_state(NR_INACTIVE_FILE)
-		- global_node_page_state(NR_FILE_MAPPED);
+		+ global_node_page_state(NR_INACTIVE_FILE);
 
 	return saveable <= size ? 0 : saveable - size;
 }

From 7bf4e594c28afc67bc120a380ca774e43ca496d8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 5 Jan 2018 02:18:42 +0100
Subject: [PATCH 58/89] PM / wakeup: Do not fail dev_pm_attach_wake_irq()
 unnecessarily

Returning an error code from dev_pm_attach_wake_irq() if
device_wakeup_attach_irq() called by it returns an error is
pointless, because the wakeup source used by it may be deleted
by user space via sysfs at any time and in particular right after
dev_pm_attach_wake_irq() has returned.  Moreover, it requires
the callers of dev_pm_attach_wake_irq() to create that wakeup
source via device_wakeup_enable() upfront, but that obviously is
racy with respect to the sysfs-based manipulations of it.

To avoid the race, modify device_wakeup_attach_irq() to check
that the wakeup source it is going to use is there (and return
early otherwise), make it void (as it cannot fail after that
change) and make dev_pm_attach_wake_irq() simply call it for
the device unconditionally.

Tested-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/power.h   | 11 +++--------
 drivers/base/power/wakeirq.c |  8 +++-----
 drivers/base/power/wakeup.c  | 11 ++++-------
 3 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h
index 7beee75399d4..21244c53e377 100644
--- a/drivers/base/power/power.h
+++ b/drivers/base/power/power.h
@@ -41,20 +41,15 @@ extern void dev_pm_disable_wake_irq_check(struct device *dev);
 
 #ifdef CONFIG_PM_SLEEP
 
-extern int device_wakeup_attach_irq(struct device *dev,
-				    struct wake_irq *wakeirq);
+extern void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq);
 extern void device_wakeup_detach_irq(struct device *dev);
 extern void device_wakeup_arm_wake_irqs(void);
 extern void device_wakeup_disarm_wake_irqs(void);
 
 #else
 
-static inline int
-device_wakeup_attach_irq(struct device *dev,
-			 struct wake_irq *wakeirq)
-{
-	return 0;
-}
+static inline void device_wakeup_attach_irq(struct device *dev,
+					    struct wake_irq *wakeirq) {}
 
 static inline void device_wakeup_detach_irq(struct device *dev)
 {
diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c
index ae0429827f31..a8ac86e4d79e 100644
--- a/drivers/base/power/wakeirq.c
+++ b/drivers/base/power/wakeirq.c
@@ -33,7 +33,6 @@ static int dev_pm_attach_wake_irq(struct device *dev, int irq,
 				  struct wake_irq *wirq)
 {
 	unsigned long flags;
-	int err;
 
 	if (!dev || !wirq)
 		return -EINVAL;
@@ -45,12 +44,11 @@ static int dev_pm_attach_wake_irq(struct device *dev, int irq,
 		return -EEXIST;
 	}
 
-	err = device_wakeup_attach_irq(dev, wirq);
-	if (!err)
-		dev->power.wakeirq = wirq;
+	dev->power.wakeirq = wirq;
+	device_wakeup_attach_irq(dev, wirq);
 
 	spin_unlock_irqrestore(&dev->power.lock, flags);
-	return err;
+	return 0;
 }
 
 /**
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index b7b8b2fe89c6..e73a081c6397 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -291,22 +291,19 @@ EXPORT_SYMBOL_GPL(device_wakeup_enable);
  *
  * Call under the device's power.lock lock.
  */
-int device_wakeup_attach_irq(struct device *dev,
+void device_wakeup_attach_irq(struct device *dev,
 			     struct wake_irq *wakeirq)
 {
 	struct wakeup_source *ws;
 
 	ws = dev->power.wakeup;
-	if (!ws) {
-		dev_err(dev, "forgot to call device_init_wakeup?\n");
-		return -EINVAL;
-	}
+	if (!ws)
+		return;
 
 	if (ws->wakeirq)
-		return -EEXIST;
+		dev_err(dev, "Leftover wakeup IRQ found, overriding\n");
 
 	ws->wakeirq = wakeirq;
-	return 0;
 }
 
 /**

From 8512220c5782d3e469cf8127a612a6c8f521e2dc Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 2 Jan 2018 17:08:50 +0100
Subject: [PATCH 59/89] PM / core: Assign the wakeup_path status flag in
 __device_prepare()

The PM core in the device_prepare() phase, resets the wakeup_path status
flag to the value of device_may_wakeup(). This means if a ->prepare() or a
->suspend() callback for the device would update the device's wakeup
setting, this doesn't become reflected in the wakeup_path status flag.

In general this isn't a problem, because wakeup settings are not supposed
to be changed (via for example calling device_set_wakeup_enable()) during
any system wide suspend/resume phase.  Nevertheless there are some users,
which can be considered as legacy, that don't conform to this behaviour.

These legacy cases should be corrected, however until that is done, let's
address the issue from the PM core, by moving the assignment of the
wakeup_path status flag to the __device_suspend() phase and after the
->suspend() callback has been invoked.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 70398e7b3569..ebcec7e677ba 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1788,6 +1788,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
  End:
 	if (!error) {
 		dev->power.is_suspended = true;
+		if (device_may_wakeup(dev))
+			dev->power.wakeup_path = true;
+
 		dpm_propagate_to_parent(dev);
 		dpm_clear_suppliers_direct_complete(dev);
 	}
@@ -1912,7 +1915,7 @@ static int device_prepare(struct device *dev, pm_message_t state)
 
 	device_lock(dev);
 
-	dev->power.wakeup_path = device_may_wakeup(dev);
+	dev->power.wakeup_path = false;
 
 	if (dev->power.no_pm_callbacks) {
 		ret = 1;	/* Let device go direct_complete */

From cf04ce7841fabc7af0d6ee273711ec29658bee7b Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 2 Jan 2018 17:08:52 +0100
Subject: [PATCH 60/89] PM / wakeup: Add device_set_wakeup_path() helper to
 control wakeup path

During system suspend, a driver may find that the wakeup setting is
enabled for its device and therefore configures it to deliver system
wakeup signals.

Additionally, sometimes the driver and its device, relies on some
further consumed resource, like an irqchip or a phy for example, to
stay powered on, as to be able to deliver system wakeup signals.

In general the driver deals with this, via raising an "enable count"
of the consumed resource or via a subsystem specific API, like
irq_set_irq_wake() or enable|disable_irq_wake() for an irqchip.
However, this may not be sufficient in cases when the resource's
device may be attached to a PM domain (genpd for example) or is
handled by a non-trivial middle layer (PCI for example).

To address cases like these, the existing ->dev.power.wakeup_path
status flag is there to help.  As a matter of fact, genpd already
monitors the flag during system suspend and acts accordingly.

However, so far it has not been clear, if anybody else but the PM
core is allowed to set the ->dev.power.wakeup_path status flag,
which is required to make this work.  For this reason, introduce
a new helper function, device_set_wakeup_path() for that.

Typically, a driver that manages a resource needed in the wakeup path
should call device_set_wakeup_path() from its ->suspend() or
->suspend_late() callback.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_wakeup.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 4c2cba7ec1d4..4238dde0aaf0 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -88,6 +88,11 @@ static inline bool device_may_wakeup(struct device *dev)
 	return dev->power.can_wakeup && !!dev->power.wakeup;
 }
 
+static inline void device_set_wakeup_path(struct device *dev)
+{
+	dev->power.wakeup_path = true;
+}
+
 /* drivers/base/power/wakeup.c */
 extern void wakeup_source_prepare(struct wakeup_source *ws, const char *name);
 extern struct wakeup_source *wakeup_source_create(const char *name);
@@ -174,6 +179,8 @@ static inline bool device_may_wakeup(struct device *dev)
 	return dev->power.can_wakeup && dev->power.should_wakeup;
 }
 
+static inline void device_set_wakeup_path(struct device *dev) {}
+
 static inline void __pm_stay_awake(struct wakeup_source *ws) {}
 
 static inline void pm_stay_awake(struct device *dev) {}

From 877b3729ca03b00800b99ac0c076e9456ef3ae6b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 3 Jan 2018 01:38:27 +0100
Subject: [PATCH 61/89] PCI / PM: Use SMART_SUSPEND and LEAVE_SUSPENDED flags
 for PCIe ports

Make the PCIe port driver set DPM_FLAG_SMART_SUSPEND and
DPM_FLAG_LEAVE_SUSPENDED for the devices handled by it to benefit
from the opportunistic optimizations in the PCI layer enabled by
these flags.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pcie/portdrv_pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index ffbf4e723527..fb1c1bb87316 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -150,6 +150,9 @@ static int pcie_portdrv_probe(struct pci_dev *dev,
 
 	pci_save_state(dev);
 
+	dev_pm_set_driver_flags(&dev->dev, DPM_FLAG_SMART_SUSPEND |
+					   DPM_FLAG_LEAVE_SUSPENDED);
+
 	if (pci_bridge_d3_possible(dev)) {
 		/*
 		 * Keep the port resumed 100ms to make sure things like

From 8425ec7faff005500aad89b9fc00e5ba91ac57b9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 3 Jan 2018 01:34:53 +0100
Subject: [PATCH 62/89] PM / mfd: intel-lpss: Use DPM_FLAG_SMART_SUSPEND

Make the intel-lpss driver set DPM_FLAG_SMART_SUSPEND for its
devices which will allow them to stay in runtime suspend during
system suspend unless they need to be reconfigured for some reason.

Also make it avoid resuming its child devices if they have
DPM_FLAG_SMART_SUSPEND set to allow them to remain in runtime
suspend during system suspend.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-for-MFD-by: Lee Jones <lee.jones@linaro.org>
Tested-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
---
 drivers/mfd/intel-lpss.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c
index 0e0ab9bb1530..9e545eb6e8b4 100644
--- a/drivers/mfd/intel-lpss.c
+++ b/drivers/mfd/intel-lpss.c
@@ -450,6 +450,8 @@ int intel_lpss_probe(struct device *dev,
 	if (ret)
 		goto err_remove_ltr;
 
+	dev_pm_set_driver_flags(dev, DPM_FLAG_SMART_SUSPEND);
+
 	return 0;
 
 err_remove_ltr:
@@ -478,7 +480,9 @@ EXPORT_SYMBOL_GPL(intel_lpss_remove);
 
 static int resume_lpss_device(struct device *dev, void *data)
 {
-	pm_runtime_resume(dev);
+	if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND))
+		pm_runtime_resume(dev);
+
 	return 0;
 }
 

From 422cb781e0d0f81789a1cc0f2171611028450f09 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 3 Jan 2018 01:35:54 +0100
Subject: [PATCH 63/89] PM: i2c-designware-platdrv: Use DPM_FLAG_SMART_PREPARE

Modify i2c-designware-platdrv to set DPM_FLAG_SMART_PREPARE for its
devices and return 0 from the system suspend ->prepare callback
if the device has an ACPI companion object in order to tell the PM
core and middle layers to avoid skipping system suspend/resume
callbacks for the device in that case (which may be problematic,
because the device may be accessed during suspend and resume of
other devices via I2C operation regions then).

Also the pm_runtime_suspended() check in dw_i2c_plat_prepare()
is not necessary any more, because the core does it when setting
power.direct_complete for the device, so drop it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Acked-by: Wolfram Sang <wsa@the-dreams.de>
Tested-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
---
 drivers/i2c/busses/i2c-designware-platdrv.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c
index 58add69a441c..4f90a6dc186f 100644
--- a/drivers/i2c/busses/i2c-designware-platdrv.c
+++ b/drivers/i2c/busses/i2c-designware-platdrv.c
@@ -372,6 +372,8 @@ static int dw_i2c_plat_probe(struct platform_device *pdev)
 	ACPI_COMPANION_SET(&adap->dev, ACPI_COMPANION(&pdev->dev));
 	adap->dev.of_node = pdev->dev.of_node;
 
+	dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE);
+
 	/* The code below assumes runtime PM to be disabled. */
 	WARN_ON(pm_runtime_enabled(&pdev->dev));
 
@@ -435,7 +437,13 @@ MODULE_DEVICE_TABLE(of, dw_i2c_of_match);
 #ifdef CONFIG_PM_SLEEP
 static int dw_i2c_plat_prepare(struct device *dev)
 {
-	return pm_runtime_suspended(dev);
+	/*
+	 * If the ACPI companion device object is present for this device, it
+	 * may be accessed during suspend and resume of other devices via I2C
+	 * operation regions, so tell the PM core and middle layers to avoid
+	 * skipping system suspend/resume callbacks for it in that case.
+	 */
+	return !has_acpi_companion(dev);
 }
 
 static void dw_i2c_plat_complete(struct device *dev)

From 02e45646d53bdb38bfb47b83765778d3ecb4d3b3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 3 Jan 2018 01:37:34 +0100
Subject: [PATCH 64/89] PM: i2c-designware-platdrv: Optimize power management

Optimize the power management in i2c-designware-platdrv by making it
set the DPM_FLAG_SMART_SUSPEND and DPM_FLAG_LEAVE_SUSPENDED which
allows some code to be dropped from its PM callbacks.

First, setting DPM_FLAG_SMART_SUSPEND causes the intel-lpss driver
to avoid resuming i2c-designware-platdrv devices in its ->prepare
callback, so they can stay in runtime suspend after that point even
if the direct-complete feature is not used for them.

It also causes the ACPI PM domain and the PM core to avoid invoking
"late" and "noirq" suspend callbacks for these devices if they are
in runtime suspend at the beginning of the "late" phase of device
suspend during system suspend.  That guarantees dw_i2c_plat_suspend()
to be called for a device only if it is not in runtime suspend.

Moreover, it causes the device's runtime PM status to be set to
"active" after calling dw_i2c_plat_resume() for it, so the
driver doesn't need internal flags to avoid invoking either
dw_i2c_plat_suspend() or dw_i2c_plat_resume() twice in a row.

Second, setting DPM_FLAG_LEAVE_SUSPENDED enables the optimization
allowing the device to stay suspended after system resume under
suitable conditions, so again the driver doesn't need to take
care of that by itself.

Accordingly, the internal "suspended" and "skip_resume" flags
used by the driver are not necessary any more, so drop them and
simplify the driver's PM callbacks.

Additionally, notice that dw_i2c_plat_complete() only needs to
schedule runtime PM resume for the device if platform firmware
has been involved in resuming the system, so make it call
pm_resume_via_firmware() to check that.  Also make it check the
runtime PM status of the device instead of its direct_complete
flag which also works if the device remained suspended due to
the DPM_FLAG_LEAVE_SUSPENDED driver flag.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Acked-by: Wolfram Sang <wsa@the-dreams.de>
Tested-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
---
 drivers/i2c/busses/i2c-designware-core.h    |  2 --
 drivers/i2c/busses/i2c-designware-platdrv.c | 31 ++++++++-------------
 2 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h
index 21bf619a86c5..9fee4c054d3d 100644
--- a/drivers/i2c/busses/i2c-designware-core.h
+++ b/drivers/i2c/busses/i2c-designware-core.h
@@ -280,8 +280,6 @@ struct dw_i2c_dev {
 	int			(*acquire_lock)(struct dw_i2c_dev *dev);
 	void			(*release_lock)(struct dw_i2c_dev *dev);
 	bool			pm_disabled;
-	bool			suspended;
-	bool			skip_resume;
 	void			(*disable)(struct dw_i2c_dev *dev);
 	void			(*disable_int)(struct dw_i2c_dev *dev);
 	int			(*init)(struct dw_i2c_dev *dev);
diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c
index 4f90a6dc186f..153b947702c5 100644
--- a/drivers/i2c/busses/i2c-designware-platdrv.c
+++ b/drivers/i2c/busses/i2c-designware-platdrv.c
@@ -42,6 +42,7 @@
 #include <linux/reset.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/suspend.h>
 
 #include "i2c-designware-core.h"
 
@@ -372,7 +373,10 @@ static int dw_i2c_plat_probe(struct platform_device *pdev)
 	ACPI_COMPANION_SET(&adap->dev, ACPI_COMPANION(&pdev->dev));
 	adap->dev.of_node = pdev->dev.of_node;
 
-	dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE);
+	dev_pm_set_driver_flags(&pdev->dev,
+				DPM_FLAG_SMART_PREPARE |
+				DPM_FLAG_SMART_SUSPEND |
+				DPM_FLAG_LEAVE_SUSPENDED);
 
 	/* The code below assumes runtime PM to be disabled. */
 	WARN_ON(pm_runtime_enabled(&pdev->dev));
@@ -448,7 +452,13 @@ static int dw_i2c_plat_prepare(struct device *dev)
 
 static void dw_i2c_plat_complete(struct device *dev)
 {
-	if (dev->power.direct_complete)
+	/*
+	 * The device can only be in runtime suspend at this point if it has not
+	 * been resumed throughout the ending system suspend/resume cycle, so if
+	 * the platform firmware might mess up with it, request the runtime PM
+	 * framework to resume it.
+	 */
+	if (pm_runtime_suspended(dev) && pm_resume_via_firmware())
 		pm_request_resume(dev);
 }
 #else
@@ -461,16 +471,9 @@ static int dw_i2c_plat_suspend(struct device *dev)
 {
 	struct dw_i2c_dev *i_dev = dev_get_drvdata(dev);
 
-	if (i_dev->suspended) {
-		i_dev->skip_resume = true;
-		return 0;
-	}
-
 	i_dev->disable(i_dev);
 	i2c_dw_plat_prepare_clk(i_dev, false);
 
-	i_dev->suspended = true;
-
 	return 0;
 }
 
@@ -478,19 +481,9 @@ static int dw_i2c_plat_resume(struct device *dev)
 {
 	struct dw_i2c_dev *i_dev = dev_get_drvdata(dev);
 
-	if (!i_dev->suspended)
-		return 0;
-
-	if (i_dev->skip_resume) {
-		i_dev->skip_resume = false;
-		return 0;
-	}
-
 	i2c_dw_plat_prepare_clk(i_dev, true);
 	i_dev->init(i_dev);
 
-	i_dev->suspended = false;
-
 	return 0;
 }
 

From 4bf236a3330e97d275e5848420f7e31948fef07a Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 5 Jan 2018 09:19:08 -0800
Subject: [PATCH 65/89] PM / sleep: Make lock/unlock_system_sleep() available
 to kernel modules

Since pm_mutex is not exported using lock/unlock_system_sleep() from
inside a kernel module causes a "pm_mutex undefined" linker error.
Hence move lock/unlock_system_sleep() into kernel/power/main.c and
export these.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/suspend.h | 28 ++--------------------------
 kernel/power/main.c     | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index d60b0f5c38d5..cc22a24516d6 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -443,32 +443,8 @@ extern bool pm_save_wakeup_count(unsigned int count);
 extern void pm_wakep_autosleep_enabled(bool set);
 extern void pm_print_active_wakeup_sources(void);
 
-static inline void lock_system_sleep(void)
-{
-	current->flags |= PF_FREEZER_SKIP;
-	mutex_lock(&pm_mutex);
-}
-
-static inline void unlock_system_sleep(void)
-{
-	/*
-	 * Don't use freezer_count() because we don't want the call to
-	 * try_to_freeze() here.
-	 *
-	 * Reason:
-	 * Fundamentally, we just don't need it, because freezing condition
-	 * doesn't come into effect until we release the pm_mutex lock,
-	 * since the freezer always works with pm_mutex held.
-	 *
-	 * More importantly, in the case of hibernation,
-	 * unlock_system_sleep() gets called in snapshot_read() and
-	 * snapshot_write() when the freezing condition is still in effect.
-	 * Which means, if we use try_to_freeze() here, it would make them
-	 * enter the refrigerator, thus causing hibernation to lockup.
-	 */
-	current->flags &= ~PF_FREEZER_SKIP;
-	mutex_unlock(&pm_mutex);
-}
+extern void lock_system_sleep(void);
+extern void unlock_system_sleep(void);
 
 #else /* !CONFIG_PM_SLEEP */
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3a2ca9066583..705c2366dafe 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,6 +22,35 @@ DEFINE_MUTEX(pm_mutex);
 
 #ifdef CONFIG_PM_SLEEP
 
+void lock_system_sleep(void)
+{
+	current->flags |= PF_FREEZER_SKIP;
+	mutex_lock(&pm_mutex);
+}
+EXPORT_SYMBOL_GPL(lock_system_sleep);
+
+void unlock_system_sleep(void)
+{
+	/*
+	 * Don't use freezer_count() because we don't want the call to
+	 * try_to_freeze() here.
+	 *
+	 * Reason:
+	 * Fundamentally, we just don't need it, because freezing condition
+	 * doesn't come into effect until we release the pm_mutex lock,
+	 * since the freezer always works with pm_mutex held.
+	 *
+	 * More importantly, in the case of hibernation,
+	 * unlock_system_sleep() gets called in snapshot_read() and
+	 * snapshot_write() when the freezing condition is still in effect.
+	 * Which means, if we use try_to_freeze() here, it would make them
+	 * enter the refrigerator, thus causing hibernation to lockup.
+	 */
+	current->flags &= ~PF_FREEZER_SKIP;
+	mutex_unlock(&pm_mutex);
+}
+EXPORT_SYMBOL_GPL(unlock_system_sleep);
+
 /* Routines for PM-transition notifications */
 
 static BLOCKING_NOTIFIER_HEAD(pm_chain_head);

From 203f8c250e2195371d418b1e8466e4caf1a0ed51 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 5 Jan 2018 09:19:09 -0800
Subject: [PATCH 66/89] block, scsi: Fix race between SPI domain validation and
 system suspend

Avoid that the following warning is reported when suspending a system
that is using the mptspi driver:

WARNING: CPU: 0 PID: 4187 at drivers/scsi/scsi_lib.c:2960 scsi_device_quiesce+0x20/0xb0
EIP: scsi_device_quiesce+0x20/0xb0
Call Trace:
 spi_dv_device+0x65/0x5f0 [scsi_transport_spi]
 mptspi_dv_device+0x4d/0x170 [mptspi]
 mptspi_dv_renegotiate_work+0x49/0xc0 [mptspi]
 process_one_work+0x190/0x2e0
 worker_thread+0x37/0x3f0
 kthread+0xcb/0x100
 ret_from_fork+0x19/0x24

Fixes: 3a0a529971ec (block, scsi: Make SCSI quiesce and resume work reliably)
Reported-by: Woody Suwalski <terraluna977@gmail.com>
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
[ rjw : Subject ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/scsi/scsi_transport_spi.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/scsi_transport_spi.c b/drivers/scsi/scsi_transport_spi.c
index 10ebb213ddb3..871ea582029e 100644
--- a/drivers/scsi/scsi_transport_spi.c
+++ b/drivers/scsi/scsi_transport_spi.c
@@ -26,6 +26,7 @@
 #include <linux/mutex.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/suspend.h>
 #include <scsi/scsi.h>
 #include "scsi_priv.h"
 #include <scsi/scsi_device.h>
@@ -1009,11 +1010,20 @@ spi_dv_device(struct scsi_device *sdev)
 	u8 *buffer;
 	const int len = SPI_MAX_ECHO_BUFFER_SIZE*2;
 
+	/*
+	 * Because this function and the power management code both call
+	 * scsi_device_quiesce(), it is not safe to perform domain validation
+	 * while suspend or resume is in progress. Hence the
+	 * lock/unlock_system_sleep() calls.
+	 */
+	lock_system_sleep();
+
 	if (unlikely(spi_dv_in_progress(starget)))
-		return;
+		goto unlock;
 
 	if (unlikely(scsi_device_get(sdev)))
-		return;
+		goto unlock;
+
 	spi_dv_in_progress(starget) = 1;
 
 	buffer = kzalloc(len, GFP_KERNEL);
@@ -1049,6 +1059,8 @@ spi_dv_device(struct scsi_device *sdev)
  out_put:
 	spi_dv_in_progress(starget) = 0;
 	scsi_device_put(sdev);
+unlock:
+	unlock_system_sleep();
 }
 EXPORT_SYMBOL(spi_dv_device);
 

From fbe313884d7ddd73ce457473cbdf3763f5b1d3da Mon Sep 17 00:00:00 2001
From: Doug Smythies <doug.smythies@gmail.com>
Date: Fri, 5 Jan 2018 14:31:16 -0800
Subject: [PATCH 67/89] tools/power/x86/intel_pstate_tracer: Free the trace
 buffer memory

The trace buffer memory should be, mostly, freed after
the buffer has been output.

This patch is required before a future patch that will allow
the user to override the default, and specify the trace buffer
memory allocation as a command line option.

Signed-off-by: Doug Smythies <dsmythies@telus.net>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../intel_pstate_tracer/intel_pstate_tracer.py    | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py b/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py
index 0b24dd9d01ff..29f50d4cfea0 100755
--- a/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py
+++ b/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py
@@ -411,6 +411,16 @@ def set_trace_buffer_size():
         print('IO error setting trace buffer size ')
         quit()
 
+def free_trace_buffer():
+    """ Free the trace buffer memory """
+
+    try:
+       open('/sys/kernel/debug/tracing/buffer_size_kb'
+                 , 'w').write("1")
+    except:
+        print('IO error setting trace buffer size ')
+        quit()
+
 def read_trace_data(filename):
     """ Read and parse trace data """
 
@@ -583,4 +593,9 @@ for root, dirs, files in os.walk('.'):
     for f in files:
         fix_ownership(f)
 
+clear_trace_file()
+# Free the memory
+if interval:
+    free_trace_buffer()
+
 os.chdir('../../')

From c9619bb293c9ab758ba298f039cf4b820dd8436f Mon Sep 17 00:00:00 2001
From: Anson Huang <Anson.Huang@nxp.com>
Date: Mon, 8 Jan 2018 10:04:50 +0800
Subject: [PATCH 68/89] ARM: dts: imx6ul: add 696MHz operating point

Add 696MHz operating point according to datasheet
(Rev. 0, 12/2015).

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Reviewed-by: Fabio Estevam <fabio.estevam@nxp.com>
Acked-by: Dong Aisheng <aisheng.dong@nxp.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/arm/boot/dts/imx6ul.dtsi | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/boot/dts/imx6ul.dtsi b/arch/arm/boot/dts/imx6ul.dtsi
index d5181f85ca9c..963e1698fe1d 100644
--- a/arch/arm/boot/dts/imx6ul.dtsi
+++ b/arch/arm/boot/dts/imx6ul.dtsi
@@ -68,12 +68,14 @@
 			clock-latency = <61036>; /* two CLK32 periods */
 			operating-points = <
 				/* kHz	uV */
+				696000	1275000
 				528000	1175000
 				396000	1025000
 				198000	950000
 			>;
 			fsl,soc-operating-points = <
 				/* KHz	uV */
+				696000	1275000
 				528000	1175000
 				396000	1175000
 				198000	1175000

From 5028f5d2b38ea68531d6b265b64e1741a141a828 Mon Sep 17 00:00:00 2001
From: Anson Huang <Anson.Huang@nxp.com>
Date: Mon, 8 Jan 2018 10:04:51 +0800
Subject: [PATCH 69/89] cpufreq: imx6q: add 696MHz operating point for i.mx6ul

Add 696MHz operating point for i.MX6UL, only for those
parts with speed grading fuse set to 2b'10 supports
696MHz operating point, so, speed grading check is also
added for i.MX6UL in this patch, the clock tree for each
operating point are as below:

696MHz:
    pll1                       696000000
       pll1_bypass             696000000
          pll1_sys             696000000
             pll1_sw           696000000
                arm            696000000
528MHz:
    pll2                       528000000
       pll2_bypass             528000000
          pll2_bus             528000000
             ca7_secondary_sel 528000000
                step           528000000
                   pll1_sw     528000000
                      arm      528000000
396MHz:
    pll2_pfd2_396m             396000000
       ca7_secondary_sel       396000000
          step                 396000000
             pll1_sw           396000000
                arm            396000000
198MHz:
    pll2_pfd2_396m             396000000
       ca7_secondary_sel       396000000
          step                 396000000
             pll1_sw           396000000
                arm            198000000

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Reviewed-by: Fabio Estevam <fabio.estevam@nxp.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Dong Aisheng <aisheng.dong@nxp.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/imx6q-cpufreq.c | 46 ++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 8bfb0775662b..741f22e5cee3 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -136,6 +136,10 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 				       clks[PLL2_PFD2_396M].clk);
 		clk_set_parent(clks[STEP].clk, clks[SECONDARY_SEL].clk);
 		clk_set_parent(clks[PLL1_SW].clk, clks[STEP].clk);
+		if (freq_hz > clk_get_rate(clks[PLL2_BUS].clk)) {
+			clk_set_rate(clks[PLL1_SYS].clk, new_freq * 1000);
+			clk_set_parent(clks[PLL1_SW].clk, clks[PLL1_SYS].clk);
+		}
 	} else {
 		clk_set_parent(clks[STEP].clk, clks[PLL2_PFD2_396M].clk);
 		clk_set_parent(clks[PLL1_SW].clk, clks[STEP].clk);
@@ -260,6 +264,43 @@ put_node:
 	of_node_put(np);
 }
 
+#define OCOTP_CFG3_6UL_SPEED_696MHZ	0x2
+
+static void imx6ul_opp_check_speed_grading(struct device *dev)
+{
+	struct device_node *np;
+	void __iomem *base;
+	u32 val;
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,imx6ul-ocotp");
+	if (!np)
+		return;
+
+	base = of_iomap(np, 0);
+	if (!base) {
+		dev_err(dev, "failed to map ocotp\n");
+		goto put_node;
+	}
+
+	/*
+	 * Speed GRADING[1:0] defines the max speed of ARM:
+	 * 2b'00: Reserved;
+	 * 2b'01: 528000000Hz;
+	 * 2b'10: 696000000Hz;
+	 * 2b'11: Reserved;
+	 * We need to set the max speed of ARM according to fuse map.
+	 */
+	val = readl_relaxed(base + OCOTP_CFG3);
+	val >>= OCOTP_CFG3_SPEED_SHIFT;
+	val &= 0x3;
+	if (val != OCOTP_CFG3_6UL_SPEED_696MHZ)
+		if (dev_pm_opp_disable(dev, 696000000))
+			dev_warn(dev, "failed to disable 696MHz OPP\n");
+	iounmap(base);
+put_node:
+	of_node_put(np);
+}
+
 static int imx6q_cpufreq_probe(struct platform_device *pdev)
 {
 	struct device_node *np;
@@ -314,7 +355,10 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev)
 		goto put_reg;
 	}
 
-	imx6q_opp_check_speed_grading(cpu_dev);
+	if (of_machine_is_compatible("fsl,imx6ul"))
+		imx6ul_opp_check_speed_grading(cpu_dev);
+	else
+		imx6q_opp_check_speed_grading(cpu_dev);
 
 	/* Because we have added the OPPs here, we must free them */
 	free_opp = true;

From 52b3672c135416d8ccc3c92f475b983a201fc6f5 Mon Sep 17 00:00:00 2001
From: Zhen Han <zhen.han@intel.com>
Date: Wed, 10 Jan 2018 08:38:23 +0800
Subject: [PATCH 70/89] powercap: add suspend and resume mechanism for SOC
 power limit

PL1 and PL2 could be throlled or de-throttled by
Thermal management to control SOC temperature.

However, currently, their value will be reset to default value
after once system suspend and resume.

Add pm_notifier to save PL1, PL2 before system suspect and restore
PL1, PL2 after system resume.

Signed-off-by: Zhen Han <zhen.han@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 97 +++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index d1694f1def72..0188cff98cdd 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -29,6 +29,7 @@
 #include <linux/sysfs.h>
 #include <linux/cpu.h>
 #include <linux/powercap.h>
+#include <linux/suspend.h>
 #include <asm/iosf_mbi.h>
 
 #include <asm/processor.h>
@@ -155,6 +156,7 @@ struct rapl_power_limit {
 	int prim_id; /* primitive ID used to enable */
 	struct rapl_domain *domain;
 	const char *name;
+	u64 last_power_limit;
 };
 
 static const char pl1_name[] = "long_term";
@@ -1533,6 +1535,92 @@ static int rapl_cpu_down_prep(unsigned int cpu)
 
 static enum cpuhp_state pcap_rapl_online;
 
+static void power_limit_state_save(void)
+{
+	struct rapl_package *rp;
+	struct rapl_domain *rd;
+	int nr_pl, ret, i;
+
+	get_online_cpus();
+	list_for_each_entry(rp, &rapl_packages, plist) {
+		if (!rp->power_zone)
+			continue;
+		rd = power_zone_to_rapl_domain(rp->power_zone);
+		nr_pl = find_nr_power_limit(rd);
+		for (i = 0; i < nr_pl; i++) {
+			switch (rd->rpl[i].prim_id) {
+			case PL1_ENABLE:
+				ret = rapl_read_data_raw(rd,
+						POWER_LIMIT1,
+						true,
+						&rd->rpl[i].last_power_limit);
+				if (ret)
+					rd->rpl[i].last_power_limit = 0;
+				break;
+			case PL2_ENABLE:
+				ret = rapl_read_data_raw(rd,
+						POWER_LIMIT2,
+						true,
+						&rd->rpl[i].last_power_limit);
+				if (ret)
+					rd->rpl[i].last_power_limit = 0;
+				break;
+			}
+		}
+	}
+	put_online_cpus();
+}
+
+static void power_limit_state_restore(void)
+{
+	struct rapl_package *rp;
+	struct rapl_domain *rd;
+	int nr_pl, i;
+
+	get_online_cpus();
+	list_for_each_entry(rp, &rapl_packages, plist) {
+		if (!rp->power_zone)
+			continue;
+		rd = power_zone_to_rapl_domain(rp->power_zone);
+		nr_pl = find_nr_power_limit(rd);
+		for (i = 0; i < nr_pl; i++) {
+			switch (rd->rpl[i].prim_id) {
+			case PL1_ENABLE:
+				if (rd->rpl[i].last_power_limit)
+					rapl_write_data_raw(rd,
+						POWER_LIMIT1,
+						rd->rpl[i].last_power_limit);
+				break;
+			case PL2_ENABLE:
+				if (rd->rpl[i].last_power_limit)
+					rapl_write_data_raw(rd,
+						POWER_LIMIT2,
+						rd->rpl[i].last_power_limit);
+				break;
+			}
+		}
+	}
+	put_online_cpus();
+}
+
+static int rapl_pm_callback(struct notifier_block *nb,
+	unsigned long mode, void *_unused)
+{
+	switch (mode) {
+	case PM_SUSPEND_PREPARE:
+		power_limit_state_save();
+		break;
+	case PM_POST_SUSPEND:
+		power_limit_state_restore();
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block rapl_pm_notifier = {
+	.notifier_call = rapl_pm_callback,
+};
+
 static int __init rapl_init(void)
 {
 	const struct x86_cpu_id *id;
@@ -1560,8 +1648,16 @@ static int __init rapl_init(void)
 
 	/* Don't bail out if PSys is not supported */
 	rapl_register_psys();
+
+	ret = register_pm_notifier(&rapl_pm_notifier);
+	if (ret)
+		goto err_unreg_all;
+
 	return 0;
 
+err_unreg_all:
+	cpuhp_remove_state(pcap_rapl_online);
+
 err_unreg:
 	rapl_unregister_powercap();
 	return ret;
@@ -1569,6 +1665,7 @@ err_unreg:
 
 static void __exit rapl_exit(void)
 {
+	unregister_pm_notifier(&rapl_pm_notifier);
 	cpuhp_remove_state(pcap_rapl_online);
 	rapl_unregister_powercap();
 }

From c23bd3877bc21d830fa650570fc1a88bea82ecd2 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 9 Jan 2018 10:03:39 +0100
Subject: [PATCH 71/89] PM / core: Re-structure code for clearing the
 direct_complete flag

To make the code more consistent, let's clear the parent's direct_complete
flag along with clearing it for suppliers, instead of as currently, when
propagating the wakeup_path flag to parents.

While changing this, let's take the opportunity to rename the affected
internal functions, to make them self-explanatory. Like this:

dpm_clear_suppliers_direct_complete -> dpm_clear_superiors_direct_complete
dpm_propagate_to_parent -> dpm_propagate_wakeup_to_parent

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/main.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index ebcec7e677ba..720e36ec84ac 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1660,7 +1660,7 @@ static int legacy_suspend(struct device *dev, pm_message_t state,
 	return error;
 }
 
-static void dpm_propagate_to_parent(struct device *dev)
+static void dpm_propagate_wakeup_to_parent(struct device *dev)
 {
 	struct device *parent = dev->parent;
 
@@ -1669,18 +1669,23 @@ static void dpm_propagate_to_parent(struct device *dev)
 
 	spin_lock_irq(&parent->power.lock);
 
-	parent->power.direct_complete = false;
 	if (dev->power.wakeup_path && !parent->power.ignore_children)
 		parent->power.wakeup_path = true;
 
 	spin_unlock_irq(&parent->power.lock);
 }
 
-static void dpm_clear_suppliers_direct_complete(struct device *dev)
+static void dpm_clear_superiors_direct_complete(struct device *dev)
 {
 	struct device_link *link;
 	int idx;
 
+	if (dev->parent) {
+		spin_lock_irq(&dev->parent->power.lock);
+		dev->parent->power.direct_complete = false;
+		spin_unlock_irq(&dev->parent->power.lock);
+	}
+
 	idx = device_links_read_lock();
 
 	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) {
@@ -1791,8 +1796,8 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 		if (device_may_wakeup(dev))
 			dev->power.wakeup_path = true;
 
-		dpm_propagate_to_parent(dev);
-		dpm_clear_suppliers_direct_complete(dev);
+		dpm_propagate_wakeup_to_parent(dev);
+		dpm_clear_superiors_direct_complete(dev);
 	}
 
 	device_unlock(dev);

From 0a99d767a9b0aae6e0fd983c889c793e4c91684c Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 9 Jan 2018 10:03:40 +0100
Subject: [PATCH 72/89] PM / core: Propagate wakeup_path status flag in
 __device_suspend_late()

Currently the wakeup_path status flag becomes propagated from a child
device to its parent device at __device_suspend(). This allows a driver
dealing with a parent device to act on the flag from its ->suspend()
callback.

However, in situations when the wakeup_path status flag needs to be set
from a ->suspend_late() callback, its value doesn't get propagated to the
parent by the PM core. Let's address this limitation, by also propagating
the flag at __device_suspend_late().

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/main.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 720e36ec84ac..02a497e7c785 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1447,6 +1447,21 @@ int dpm_suspend_noirq(pm_message_t state)
 	return ret;
 }
 
+static void dpm_propagate_wakeup_to_parent(struct device *dev)
+{
+	struct device *parent = dev->parent;
+
+	if (!parent)
+		return;
+
+	spin_lock_irq(&parent->power.lock);
+
+	if (dev->power.wakeup_path && !parent->power.ignore_children)
+		parent->power.wakeup_path = true;
+
+	spin_unlock_irq(&parent->power.lock);
+}
+
 static pm_callback_t dpm_subsys_suspend_late_cb(struct device *dev,
 						pm_message_t state,
 						const char **info_p)
@@ -1527,6 +1542,7 @@ Run:
 		async_error = error;
 		goto Complete;
 	}
+	dpm_propagate_wakeup_to_parent(dev);
 
 Skip:
 	dev->power.is_late_suspended = true;
@@ -1660,21 +1676,6 @@ static int legacy_suspend(struct device *dev, pm_message_t state,
 	return error;
 }
 
-static void dpm_propagate_wakeup_to_parent(struct device *dev)
-{
-	struct device *parent = dev->parent;
-
-	if (!parent)
-		return;
-
-	spin_lock_irq(&parent->power.lock);
-
-	if (dev->power.wakeup_path && !parent->power.ignore_children)
-		parent->power.wakeup_path = true;
-
-	spin_unlock_irq(&parent->power.lock);
-}
-
 static void dpm_clear_superiors_direct_complete(struct device *dev)
 {
 	struct device_link *link;

From a935424bb658f9ca37eb5e94119b857998341356 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 10 Jan 2018 21:31:56 +0100
Subject: [PATCH 73/89] PM / domains: Don't skip driver's
 ->suspend|resume_noirq() callbacks

Commit 10da65423fdb (PM / Domains: Call driver's noirq callbacks)
started to respect driver's noirq callbacks, but while doing that it
also introduced a few potential problems.

More precisely, in genpd_finish_suspend() and genpd_resume_noirq()
the noirq callbacks at the driver level should be invoked, no matter
of whether dev->power.wakeup_path is set or not.

Additionally, the commit in question also made genpd_resume_noirq()
to ignore the return value from pm_runtime_force_resume().

Let's fix both these issues!

Fixes: 10da65423fdb (PM / Domains: Call driver's noirq callbacks)
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index f9dcc981b6b9..48255ce7c0ad 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1032,15 +1032,12 @@ static int genpd_prepare(struct device *dev)
 static int genpd_finish_suspend(struct device *dev, bool poweroff)
 {
 	struct generic_pm_domain *genpd;
-	int ret;
+	int ret = 0;
 
 	genpd = dev_to_genpd(dev);
 	if (IS_ERR(genpd))
 		return -EINVAL;
 
-	if (dev->power.wakeup_path && genpd_is_active_wakeup(genpd))
-		return 0;
-
 	if (poweroff)
 		ret = pm_generic_poweroff_noirq(dev);
 	else
@@ -1048,10 +1045,18 @@ static int genpd_finish_suspend(struct device *dev, bool poweroff)
 	if (ret)
 		return ret;
 
+	if (dev->power.wakeup_path && genpd_is_active_wakeup(genpd))
+		return 0;
+
 	if (genpd->dev_ops.stop && genpd->dev_ops.start) {
 		ret = pm_runtime_force_suspend(dev);
-		if (ret)
+		if (ret) {
+			if (poweroff)
+				pm_generic_restore_noirq(dev);
+			else
+				pm_generic_resume_noirq(dev);
 			return ret;
+		}
 	}
 
 	genpd_lock(genpd);
@@ -1085,7 +1090,7 @@ static int genpd_suspend_noirq(struct device *dev)
 static int genpd_resume_noirq(struct device *dev)
 {
 	struct generic_pm_domain *genpd;
-	int ret = 0;
+	int ret;
 
 	dev_dbg(dev, "%s()\n", __func__);
 
@@ -1094,21 +1099,20 @@ static int genpd_resume_noirq(struct device *dev)
 		return -EINVAL;
 
 	if (dev->power.wakeup_path && genpd_is_active_wakeup(genpd))
-		return 0;
+		return pm_generic_resume_noirq(dev);
 
 	genpd_lock(genpd);
 	genpd_sync_power_on(genpd, true, 0);
 	genpd->suspended_count--;
 	genpd_unlock(genpd);
 
-	if (genpd->dev_ops.stop && genpd->dev_ops.start)
+	if (genpd->dev_ops.stop && genpd->dev_ops.start) {
 		ret = pm_runtime_force_resume(dev);
+		if (ret)
+			return ret;
+	}
 
-	ret = pm_generic_resume_noirq(dev);
-	if (ret)
-		return ret;
-
-	return ret;
+	return pm_generic_resume_noirq(dev);
 }
 
 /**

From 0026cef067d2962ed064b974e07f017233d5bd5a Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Thu, 11 Jan 2018 09:18:59 +0100
Subject: [PATCH 74/89] PM / wakeup: Print warn if device gets enabled as
 wakeup source during sleep

In general, wakeup settings are not supposed to be changed during any of
the system wide PM phases. The reason is simply that it would break
guarantees provided by the PM core, to properly act on active wakeup
sources.

However, there are exceptions to when, in particular, disabling a device as
wakeup source makes sense. For example, in cases when a driver realizes
that its device is dead during system suspend. For these scenarios, we
don't need to care about acting on the wakeup source correctly, because a
dead device shouldn't deliver wakeup signals.

To this reasoning and to help users to properly manage wakeup settings,
let's print a warning in cases someone calls device_wakeup_enable() during
system sleep.

Suggested-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
[ rjw: Message to be printed ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeup.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index e73a081c6397..ea01621ed769 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -19,6 +19,11 @@
 
 #include "power.h"
 
+#ifndef CONFIG_SUSPEND
+suspend_state_t pm_suspend_target_state;
+#define pm_suspend_target_state	(PM_SUSPEND_ON)
+#endif
+
 /*
  * If set, the suspend/hibernate code will abort transitions to a sleep state
  * if wakeup events are registered during or immediately before the transition.
@@ -268,6 +273,9 @@ int device_wakeup_enable(struct device *dev)
 	if (!dev || !dev->power.can_wakeup)
 		return -EINVAL;
 
+	if (pm_suspend_target_state != PM_SUSPEND_ON)
+		dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__);
+
 	ws = wakeup_source_register(dev_name(dev));
 	if (!ws)
 		return -ENOMEM;

From 29a5a6d7082427371519ae1e186d9e35612801fb Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 11 Jan 2018 02:13:58 +0100
Subject: [PATCH 75/89] ACPI / PM: Use Low Power S0 Idle on more systems

Some systems don't support the ACPI_LPS0_ENTRY and ACPI_LPS0_EXIT
functions in their Low Power S0 Idle _DSM, but still expect EC
events to be processed in the suspend-to-idle state for power button
wakeup (among other things) to work.  Surface Pro3 turns out to be
one of them.

Fortunately, it still provides Low Power S0 Idle _DSM with the screen
on/off functions supported, so modify the ACPI suspend-to-idle to use
the Low Power S0 Idle code path for all systems supporting the
ACPI_LPS0_ENTRY and ACPI_LPS0_EXIT or the ACPI_LPS0_SCREEN_OFF and
ACPI_LPS0_SCREEN_ON functions in their Low Power S0 Idle _DSM.

Potentially, that will cause more systems to use suspend-to-idle by
default, so some future corrections may be necessary if it leads
to issues, but let it remain more straightforward for now.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=198389#add_comment
Reported-by: Valentin Manea <valy@mrs.ro>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Tested-by: Valentin Manea <valy@mrs.ro>
---
 drivers/acpi/sleep.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 15cd862a87c2..46cde0912762 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -707,7 +707,8 @@ static const struct acpi_device_id lps0_device_ids[] = {
 #define ACPI_LPS0_ENTRY		5
 #define ACPI_LPS0_EXIT		6
 
-#define ACPI_S2IDLE_FUNC_MASK	((1 << ACPI_LPS0_ENTRY) | (1 << ACPI_LPS0_EXIT))
+#define ACPI_LPS0_SCREEN_MASK	((1 << ACPI_LPS0_SCREEN_OFF) | (1 << ACPI_LPS0_SCREEN_ON))
+#define ACPI_LPS0_PLATFORM_MASK	((1 << ACPI_LPS0_ENTRY) | (1 << ACPI_LPS0_EXIT))
 
 static acpi_handle lps0_device_handle;
 static guid_t lps0_dsm_guid;
@@ -910,7 +911,8 @@ static int lps0_device_attach(struct acpi_device *adev,
 	if (out_obj && out_obj->type == ACPI_TYPE_BUFFER) {
 		char bitmask = *(char *)out_obj->buffer.pointer;
 
-		if ((bitmask & ACPI_S2IDLE_FUNC_MASK) == ACPI_S2IDLE_FUNC_MASK) {
+		if ((bitmask & ACPI_LPS0_PLATFORM_MASK) == ACPI_LPS0_PLATFORM_MASK ||
+		    (bitmask & ACPI_LPS0_SCREEN_MASK) == ACPI_LPS0_SCREEN_MASK) {
 			lps0_dsm_func_mask = bitmask;
 			lps0_device_handle = adev->handle;
 			/*

From 19351f340765ebef48d07eade8ffb5f6f1118244 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 10 Jan 2018 13:26:35 +0100
Subject: [PATCH 76/89] platform/x86: surfacepro3: Support for wakeup from
 suspend-to-idle

Modify surface_button_notify() to make it wake up the system from
suspend-to-idle (by reporting "hard" wakeup events while suspended)
and add wakeup initialization to surface_button_add() for wakeup
events reported by this driver to work at all.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=198389
Reported-by: Valentin Manea <valy@mrs.ro>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Tested-by: Valentin Manea <valy@mrs.ro>
---
 drivers/platform/x86/surfacepro3_button.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/surfacepro3_button.c b/drivers/platform/x86/surfacepro3_button.c
index 6505c97705e1..1b491690ce07 100644
--- a/drivers/platform/x86/surfacepro3_button.c
+++ b/drivers/platform/x86/surfacepro3_button.c
@@ -119,7 +119,7 @@ static void surface_button_notify(struct acpi_device *device, u32 event)
 	if (key_code == KEY_RESERVED)
 		return;
 	if (pressed)
-		pm_wakeup_event(&device->dev, 0);
+		pm_wakeup_dev_event(&device->dev, 0, button->suspended);
 	if (button->suspended)
 		return;
 	input_report_key(input, key_code, pressed?1:0);
@@ -185,6 +185,8 @@ static int surface_button_add(struct acpi_device *device)
 	error = input_register_device(input);
 	if (error)
 		goto err_free_input;
+
+	device_init_wakeup(&device->dev, true);
 	dev_info(&device->dev,
 			"%s [%s]\n", name, acpi_device_bid(device));
 	return 0;

From dbd49b85eec7eb6d7ae61bad8306d5cdd85c142d Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 10 Jan 2018 11:38:51 -0800
Subject: [PATCH 77/89] cpufreq: intel_pstate: Replace bxt_funcs with
 core_funcs

Since core_funcs and bxt_funcs have same set of callbacks, replace
bxt_funcs with core_funcs.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 93a0e88bef76..3b6616b15c59 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1595,15 +1595,6 @@ static const struct pstate_funcs knl_funcs = {
 	.get_val = core_get_val,
 };
 
-static const struct pstate_funcs bxt_funcs = {
-	.get_max = core_get_max_pstate,
-	.get_max_physical = core_get_max_pstate_physical,
-	.get_min = core_get_min_pstate,
-	.get_turbo = core_get_turbo_pstate,
-	.get_scaling = core_get_scaling,
-	.get_val = core_get_val,
-};
-
 #define ICPU(model, policy) \
 	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
 			(unsigned long)&policy }
@@ -1627,8 +1618,8 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
 	ICPU(INTEL_FAM6_BROADWELL_XEON_D,	core_funcs),
 	ICPU(INTEL_FAM6_XEON_PHI_KNL,		knl_funcs),
 	ICPU(INTEL_FAM6_XEON_PHI_KNM,		knl_funcs),
-	ICPU(INTEL_FAM6_ATOM_GOLDMONT,		bxt_funcs),
-	ICPU(INTEL_FAM6_ATOM_GEMINI_LAKE,       bxt_funcs),
+	ICPU(INTEL_FAM6_ATOM_GOLDMONT,		core_funcs),
+	ICPU(INTEL_FAM6_ATOM_GEMINI_LAKE,       core_funcs),
 	{}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);

From d8de7a44e11f98f2c2a4c2e12e79ba9ffb839306 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 10 Jan 2018 11:38:52 -0800
Subject: [PATCH 78/89] cpufreq: intel_pstate: Add Skylake servers support

Currently intel_pstate can function only in HWP mode on Skylake servers.
When HWP feature is not enabled on the processor then acpi-cpufreq is
driver is used.

Based on the power and performance tests using intel_pstate scaling
algorithm the results are comparable. But intel_pstate brings in
additional features:
 - Display of turbo frequency range, which many users like to see
 - Place limits in the turbo frequency range when platform allows

Since these tests are done only using non PID algorithm introduced in
kernel version 4.14, this patch is not a backport candidate. So each user
has to carefully weigh the benefits before he backports.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 3b6616b15c59..7edf7a0e5a96 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1620,6 +1620,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
 	ICPU(INTEL_FAM6_XEON_PHI_KNM,		knl_funcs),
 	ICPU(INTEL_FAM6_ATOM_GOLDMONT,		core_funcs),
 	ICPU(INTEL_FAM6_ATOM_GEMINI_LAKE,       core_funcs),
+	ICPU(INTEL_FAM6_SKYLAKE_X,		core_funcs),
 	{}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);

From 3fa4680b860bf48b437d6a2c039789c4abe202ae Mon Sep 17 00:00:00 2001
From: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
Date: Fri, 12 Jan 2018 12:43:53 +0530
Subject: [PATCH 79/89] cpufreq: powernv: Dont assume distinct pstate values
 for nominal and pmin

Some OpenPOWER boxes can have same pstate values for nominal and
pmin pstates. In these boxes the current code will not initialize
'powernv_pstate_info.min' variable and result in erroneous CPU
frequency reporting. This patch fixes this problem.

Fixes: 09ca4c9b5958 (cpufreq: powernv: Replacing pstate_id with frequency table index)
Reported-by: Alvin Wang <wangat@tw.ibm.com>
Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: 4.8+ <stable@vger.kernel.org> # 4.8+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 8a4e2ce0804c..29cdec198657 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -337,9 +337,9 @@ next:
 
 		if (id == pstate_max)
 			powernv_pstate_info.max = i;
-		else if (id == pstate_nominal)
+		if (id == pstate_nominal)
 			powernv_pstate_info.nominal = i;
-		else if (id == pstate_min)
+		if (id == pstate_min)
 			powernv_pstate_info.min = i;
 
 		if (powernv_pstate_info.wof_enabled && id == pstate_turbo) {

From 17218e0092f8c7b7edce7ff08c8b23212eac7271 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 12 Jan 2018 14:10:38 +0100
Subject: [PATCH 80/89] PM / genpd: Stop/start devices without
 pm_runtime_force_suspend/resume()

There are problems with calling pm_runtime_force_suspend/resume()
to "stop" and "start" devices in genpd_finish_suspend() and
genpd_resume_noirq() (and in analogous hibernation-specific genpd
callbacks) after commit 122a22377a3d (PM / Domains: Stop/start
devices during system PM suspend/resume in genpd) as those routines
do much more than just "stopping" and "starting" devices (which was
the stated purpose of that commit) unnecessarily and may not play
well with system-wide PM driver callbacks.

First, consider the pm_runtime_force_suspend() in
genpd_finish_suspend().  If the current runtime PM status of the
device is "suspended", that function most likely does the right thing
by ignoring the device, because it should have been "stopped" already
and whatever needed to be done to deactivate it shoud have been done.
In turn, if the runtime PM status of the device is "active",
genpd_runtime_suspend() is called for it (indirectly) and (1) runs
the ->runtime_suspend callback provided by the device's driver
(assuming no bus type with ->runtime_suspend of its own), (2) "stops"
the device and (3) checks if the domain can be powered down, and then
(4) the device's runtime PM status is changed to "suspended".  Out of
the four actions above (1) is not necessary and it may be outright
harmful, (3) is pointless and (4) is questionable.  The only
operation that needs to be carried out here is (2).

The reason why (1) is not necessary is because the system-wide
PM callbacks provided by the device driver for the transition in
question have been run and they should have taken care of the
driver's part of device suspend already.  Moreover, it may be
harmful, because the ->runtime_suspend callback may want to
access the device which is partially suspended at that point
and may not be responsive.  Also, system-wide PM callbacks may
have been run already (in the previous phases of the system
transition under way) for the device's parent or for its supplier
devices (if any) and the device may not be accessible because of
that.

There also is no reason to do (3), because genpd_finish_suspend()
will repeat it anyway, and (4) potentially causes confusion to ensue
during the subsequent system transition to the working state.

Consider pm_runtime_force_resume() in genpd_resume_noirq() now.
It runs genpd_runtime_resume() for all devices with runtime PM
status set to "suspended", which includes all of the devices
whose runtime PM status was changed by pm_runtime_force_suspend()
before and may include some devices already suspended when the
pm_runtime_force_suspend() was running, which may be confusing.  The
genpd_runtime_resume() first tries to power up the domain, which
(again) is pointless, because genpd_resume_noirq() has done that
already.  Then, it "starts" the device and runs the ->runtime_resume
callback (from the driver, say) for it.  If all is well, the device
is left with the runtime PM status set to "active".

Unfortunately, running the driver's ->runtime_resume callback
before its system-wide PM callbacks and possibly before some
system-wide PM callbacks of the parent device's driver (let
alone supplier drivers) is asking for trouble, especially if
the device had been suspended before pm_runtime_force_suspend()
ran previously or if the callbacks in question expect to be run
back-to-back with their suspend-side counterparts.  It also should
not be necessary, because the system-wide PM driver callbacks that
will be invoked for the device subsequently should take care of
resuming it just fine.

[Running the driver's ->runtime_resume callback in the "noirq"
phase of the transition to the working state may be problematic
even for devices whose drivers do use pm_runtime_force_resume()
in (or as) their system-wide PM callbacks if they have suppliers
other than their parents, because it may cause the supplier to
be resumed after the consumer in some cases.]

Because of the above, modify genpd as follows:

 1. Change genpd_finish_suspend() to only "stop" devices with
    runtime PM status set to "active" (without invoking runtime PM
    callbacks for them, changing their runtime PM status and so on).

    That doesn't change the handling of devices whose drivers use
    pm_runtime_force_suspend/resume() in (or as) their system-wide
    PM callbacks and addresses the issues described above for the
    other devices.

 2. Change genpd_resume_noirq() to only "start" devices with
    runtime PM status set to "active" (without invoking runtime PM
    callbacks for them, changing their runtime PM status and so on).

    Again, that doesn't change the handling of devices whose drivers
    use pm_runtime_force_suspend/resume() in (or as) their system-wide
    PM callbacks and addresses the described issues for the other
    devices.  Devices with runtime PM status set to "suspended"
    are not started with the assumption that they will be resumed
    later, either by pm_runtime_force_resume() or via runtime PM.

 3. Change genpd_restore_noirq() to follow genpd_resume_noirq().

    That causes devices already suspended before hibernation to be
    left alone (which also is the case without the change) and
    avoids running the ->runtime_resume driver callback too early
    for the other devices.

 4. Change genpd_freeze_noirq() and genpd_thaw_noirq() in accordance
    with the above modifications.

Fixes: 122a22377a3d (PM / Domains: Stop/start devices during system PM suspend/resume in genpd)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/base/power/domain.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 48255ce7c0ad..528b24149bc7 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1048,8 +1048,9 @@ static int genpd_finish_suspend(struct device *dev, bool poweroff)
 	if (dev->power.wakeup_path && genpd_is_active_wakeup(genpd))
 		return 0;
 
-	if (genpd->dev_ops.stop && genpd->dev_ops.start) {
-		ret = pm_runtime_force_suspend(dev);
+	if (genpd->dev_ops.stop && genpd->dev_ops.start &&
+	    !pm_runtime_status_suspended(dev)) {
+		ret = genpd_stop_dev(genpd, dev);
 		if (ret) {
 			if (poweroff)
 				pm_generic_restore_noirq(dev);
@@ -1106,8 +1107,9 @@ static int genpd_resume_noirq(struct device *dev)
 	genpd->suspended_count--;
 	genpd_unlock(genpd);
 
-	if (genpd->dev_ops.stop && genpd->dev_ops.start) {
-		ret = pm_runtime_force_resume(dev);
+	if (genpd->dev_ops.stop && genpd->dev_ops.start &&
+	    !pm_runtime_status_suspended(dev)) {
+		ret = genpd_start_dev(genpd, dev);
 		if (ret)
 			return ret;
 	}
@@ -1139,8 +1141,9 @@ static int genpd_freeze_noirq(struct device *dev)
 	if (ret)
 		return ret;
 
-	if (genpd->dev_ops.stop && genpd->dev_ops.start)
-		ret = pm_runtime_force_suspend(dev);
+	if (genpd->dev_ops.stop && genpd->dev_ops.start &&
+	    !pm_runtime_status_suspended(dev))
+		ret = genpd_stop_dev(genpd, dev);
 
 	return ret;
 }
@@ -1163,8 +1166,9 @@ static int genpd_thaw_noirq(struct device *dev)
 	if (IS_ERR(genpd))
 		return -EINVAL;
 
-	if (genpd->dev_ops.stop && genpd->dev_ops.start) {
-		ret = pm_runtime_force_resume(dev);
+	if (genpd->dev_ops.stop && genpd->dev_ops.start &&
+	    !pm_runtime_status_suspended(dev)) {
+		ret = genpd_start_dev(genpd, dev);
 		if (ret)
 			return ret;
 	}
@@ -1221,8 +1225,9 @@ static int genpd_restore_noirq(struct device *dev)
 	genpd_sync_power_on(genpd, true, 0);
 	genpd_unlock(genpd);
 
-	if (genpd->dev_ops.stop && genpd->dev_ops.start) {
-		ret = pm_runtime_force_resume(dev);
+	if (genpd->dev_ops.stop && genpd->dev_ops.start &&
+	    !pm_runtime_status_suspended(dev)) {
+		ret = genpd_start_dev(genpd, dev);
 		if (ret)
 			return ret;
 	}

From 4918e1f87c5fb7fc8f73a7d8fb118beeb94e05f7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 12 Jan 2018 14:12:05 +0100
Subject: [PATCH 81/89] PM / runtime: Rework pm_runtime_force_suspend/resume()

One of the limitations of pm_runtime_force_suspend/resume() is that
if a parent driver wants to use these functions, all of its child
drivers generally have to do that too because of the parent usage
counter manipulations necessary to get the correct state of the parent
during system-wide transitions to the working state (system resume).
However, that limitation turns out to be artificial, so remove it.

Namely, pm_runtime_force_suspend() only needs to update the children
counter of its parent (if there's is a parent) when the device can
stay in suspend after the subsequent system resume transition, as
that counter is correct already otherwise.  Now, if the parent's
children counter is not updated, it is not necessary to increment
the parent's usage counter in that case any more, as long as the
children counters of devices are checked along with their usage
counters in order to decide whether or not the devices may be left
in suspend after the subsequent system resume transition.

Accordingly, modify pm_runtime_force_suspend() to only call
pm_runtime_set_suspended() for devices whose usage and children
counters are at the "no references" level (the runtime PM status
of the device needs to be updated to "suspended" anyway in case
this function is called once again for the same device during the
transition under way), drop the parent usage counter incrementation
from it and update pm_runtime_force_resume() to compensate for these
changes.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/base/power/runtime.c | 74 +++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 40 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 6e89b51ea3d9..84832f1a75bf 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1613,17 +1613,28 @@ void pm_runtime_drop_link(struct device *dev)
 	spin_unlock_irq(&dev->power.lock);
 }
 
+static bool pm_runtime_need_not_resume(struct device *dev)
+{
+	return atomic_read(&dev->power.usage_count) <= 1 &&
+		atomic_read(&dev->power.child_count) == 0;
+}
+
 /**
  * pm_runtime_force_suspend - Force a device into suspend state if needed.
  * @dev: Device to suspend.
  *
  * Disable runtime PM so we safely can check the device's runtime PM status and
- * if it is active, invoke it's .runtime_suspend callback to bring it into
- * suspend state. Keep runtime PM disabled to preserve the state unless we
- * encounter errors.
+ * if it is active, invoke its ->runtime_suspend callback to suspend it and
+ * change its runtime PM status field to RPM_SUSPENDED.  Also, if the device's
+ * usage and children counters don't indicate that the device was in use before
+ * the system-wide transition under way, decrement its parent's children counter
+ * (if there is a parent).  Keep runtime PM disabled to preserve the state
+ * unless we encounter errors.
  *
  * Typically this function may be invoked from a system suspend callback to make
- * sure the device is put into low power state.
+ * sure the device is put into low power state and it should only be used during
+ * system-wide PM transitions to sleep states.  It assumes that the analogous
+ * pm_runtime_force_resume() will be used to resume the device.
  */
 int pm_runtime_force_suspend(struct device *dev)
 {
@@ -1646,17 +1657,18 @@ int pm_runtime_force_suspend(struct device *dev)
 		goto err;
 
 	/*
-	 * Increase the runtime PM usage count for the device's parent, in case
-	 * when we find the device being used when system suspend was invoked.
-	 * This informs pm_runtime_force_resume() to resume the parent
-	 * immediately, which is needed to be able to resume its children,
-	 * when not deferring the resume to be managed via runtime PM.
+	 * If the device can stay in suspend after the system-wide transition
+	 * to the working state that will follow, drop the children counter of
+	 * its parent, but set its status to RPM_SUSPENDED anyway in case this
+	 * function will be called again for it in the meantime.
 	 */
-	if (dev->parent && atomic_read(&dev->power.usage_count) > 1)
-		pm_runtime_get_noresume(dev->parent);
+	if (pm_runtime_need_not_resume(dev))
+		pm_runtime_set_suspended(dev);
+	else
+		__update_runtime_status(dev, RPM_SUSPENDED);
 
-	pm_runtime_set_suspended(dev);
 	return 0;
+
 err:
 	pm_runtime_enable(dev);
 	return ret;
@@ -1669,13 +1681,9 @@ EXPORT_SYMBOL_GPL(pm_runtime_force_suspend);
  *
  * Prior invoking this function we expect the user to have brought the device
  * into low power state by a call to pm_runtime_force_suspend(). Here we reverse
- * those actions and brings the device into full power, if it is expected to be
- * used on system resume. To distinguish that, we check whether the runtime PM
- * usage count is greater than 1 (the PM core increases the usage count in the
- * system PM prepare phase), as that indicates a real user (such as a subsystem,
- * driver, userspace, etc.) is using it. If that is the case, the device is
- * expected to be used on system resume as well, so then we resume it. In the
- * other case, we defer the resume to be managed via runtime PM.
+ * those actions and bring the device into full power, if it is expected to be
+ * used on system resume.  In the other case, we defer the resume to be managed
+ * via runtime PM.
  *
  * Typically this function may be invoked from a system resume callback.
  */
@@ -1684,32 +1692,18 @@ int pm_runtime_force_resume(struct device *dev)
 	int (*callback)(struct device *);
 	int ret = 0;
 
-	callback = RPM_GET_CALLBACK(dev, runtime_resume);
-
-	if (!callback) {
-		ret = -ENOSYS;
-		goto out;
-	}
-
-	if (!pm_runtime_status_suspended(dev))
+	if (!pm_runtime_status_suspended(dev) || pm_runtime_need_not_resume(dev))
 		goto out;
 
 	/*
-	 * Decrease the parent's runtime PM usage count, if we increased it
-	 * during system suspend in pm_runtime_force_suspend().
-	*/
-	if (atomic_read(&dev->power.usage_count) > 1) {
-		if (dev->parent)
-			pm_runtime_put_noidle(dev->parent);
-	} else {
-		goto out;
-	}
+	 * The value of the parent's children counter is correct already, so
+	 * just update the status of the device.
+	 */
+	__update_runtime_status(dev, RPM_ACTIVE);
 
-	ret = pm_runtime_set_active(dev);
-	if (ret)
-		goto out;
+	callback = RPM_GET_CALLBACK(dev, runtime_resume);
 
-	ret = callback(dev);
+	ret = callback ? callback(dev) : -ENOSYS;
 	if (ret) {
 		pm_runtime_set_suspended(dev);
 		goto out;

From 1f5c6855260141ac3115e9a065491ee2ac07f9bc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 15 Jan 2018 01:46:25 +0100
Subject: [PATCH 82/89] PM / runtime: Check ignore_children in
 pm_runtime_need_not_resume()

Modify pm_runtime_need_not_resume() to make it avoid taking
power.child_count for devices with power.ignore_children which
is consistent with the runtime PM usage of these fields.

Suggested-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/base/power/runtime.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 84832f1a75bf..cb5e48b86453 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1616,7 +1616,8 @@ void pm_runtime_drop_link(struct device *dev)
 static bool pm_runtime_need_not_resume(struct device *dev)
 {
 	return atomic_read(&dev->power.usage_count) <= 1 &&
-		atomic_read(&dev->power.child_count) == 0;
+		(atomic_read(&dev->power.child_count) == 0 ||
+		 dev->power.ignore_children);
 }
 
 /**

From 8ffdfe35b8a67e509dee5719807b7f88ed2cda7e Mon Sep 17 00:00:00 2001
From: Kyungsik Lee <kyungsik.lee@lge.com>
Date: Tue, 16 Jan 2018 10:19:43 +0900
Subject: [PATCH 83/89] PM / hibernate: Drop unused parameter of enough_swap

Parameter flags is no longer used, remove it.

Signed-off-by: Kyungsik Lee <kyungsik.lee@lge.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/swap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 293ead59eccc..a46be1261c09 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -879,7 +879,7 @@ out_clean:
  *	space avaiable from the resume partition.
  */
 
-static int enough_swap(unsigned int nr_pages, unsigned int flags)
+static int enough_swap(unsigned int nr_pages)
 {
 	unsigned int free_swap = count_swap_pages(root_swap, 1);
 	unsigned int required;
@@ -915,7 +915,7 @@ int swsusp_write(unsigned int flags)
 		return error;
 	}
 	if (flags & SF_NOCOMPRESS_MODE) {
-		if (!enough_swap(pages, flags)) {
+		if (!enough_swap(pages)) {
 			pr_err("Not enough free swap\n");
 			error = -ENOSPC;
 			goto out_finish;

From 617fcb673090e495f58565ff0171d07abdad53a7 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 16 Jan 2018 09:01:27 +0100
Subject: [PATCH 84/89] PM / runtime: Allow no callbacks in
 pm_runtime_force_suspend|resume()

The pm_runtime_force_suspend|resume() helpers currently requires the device
to at some level (PM domain, bus, etc), have the ->runtime_suspend|resume()
callbacks assigned for it, else -ENOSYS is returned as an error.

However, there are no reason for this requirement, so let's simply remove
it by allowing these callbacks to be NULL.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index cb5e48b86453..8bef3cb2424d 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1640,7 +1640,7 @@ static bool pm_runtime_need_not_resume(struct device *dev)
 int pm_runtime_force_suspend(struct device *dev)
 {
 	int (*callback)(struct device *);
-	int ret = 0;
+	int ret;
 
 	pm_runtime_disable(dev);
 	if (pm_runtime_status_suspended(dev))
@@ -1648,12 +1648,7 @@ int pm_runtime_force_suspend(struct device *dev)
 
 	callback = RPM_GET_CALLBACK(dev, runtime_suspend);
 
-	if (!callback) {
-		ret = -ENOSYS;
-		goto err;
-	}
-
-	ret = callback(dev);
+	ret = callback ? callback(dev) : 0;
 	if (ret)
 		goto err;
 
@@ -1704,7 +1699,7 @@ int pm_runtime_force_resume(struct device *dev)
 
 	callback = RPM_GET_CALLBACK(dev, runtime_resume);
 
-	ret = callback ? callback(dev) : -ENOSYS;
+	ret = callback ? callback(dev) : 0;
 	if (ret) {
 		pm_runtime_set_suspended(dev);
 		goto out;

From 1131b0a4af911de50b22239cabdf6dcd3f15df15 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 17 Jan 2018 10:38:28 +0100
Subject: [PATCH 85/89] dmaengine: rcar-dmac: Make DMAC reinit during system
 resume explicit

The current (empty) system sleep callbacks rely on the PM core to force
a runtime resume to reinitialize the DMAC registers during system
resume.  Without a reinitialization, e.g. SCIF DMA will hang silently
after a system resume on R-Car Gen3.

Make this explicit by using pm_runtime_force_{suspend,resume}() as the
system sleep callbacks instead.  Use SET_LATE_SYSTEM_SLEEP_PM_OPS() as
DMA engines must be initialized before all DMA slave devices.

Fixes: 17218e0092f8 "PM / genpd: Stop/start devices without pm_runtime_force_suspend/resume()"
Suggested-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/dma/sh/rcar-dmac.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c
index 2b2c7db3e480..35c3936edc45 100644
--- a/drivers/dma/sh/rcar-dmac.c
+++ b/drivers/dma/sh/rcar-dmac.c
@@ -1615,22 +1615,6 @@ static struct dma_chan *rcar_dmac_of_xlate(struct of_phandle_args *dma_spec,
  * Power management
  */
 
-#ifdef CONFIG_PM_SLEEP
-static int rcar_dmac_sleep_suspend(struct device *dev)
-{
-	/*
-	 * TODO: Wait for the current transfer to complete and stop the device.
-	 */
-	return 0;
-}
-
-static int rcar_dmac_sleep_resume(struct device *dev)
-{
-	/* TODO: Resume transfers, if any. */
-	return 0;
-}
-#endif
-
 #ifdef CONFIG_PM
 static int rcar_dmac_runtime_suspend(struct device *dev)
 {
@@ -1646,7 +1630,13 @@ static int rcar_dmac_runtime_resume(struct device *dev)
 #endif
 
 static const struct dev_pm_ops rcar_dmac_pm = {
-	SET_SYSTEM_SLEEP_PM_OPS(rcar_dmac_sleep_suspend, rcar_dmac_sleep_resume)
+	/*
+	 * TODO for system sleep/resume:
+	 *   - Wait for the current transfer to complete and stop the device,
+	 *   - Resume transfers, if any.
+	 */
+	SET_LATE_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+				     pm_runtime_force_resume)
 	SET_RUNTIME_PM_OPS(rcar_dmac_runtime_suspend, rcar_dmac_runtime_resume,
 			   NULL)
 };

From 01857cf7748aef41c20987526d4c12f12b2f04ff Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@kernel.org>
Date: Wed, 17 Jan 2018 10:30:34 +0000
Subject: [PATCH 86/89] powercap: intel_rapl: Fix trailing semicolon

The trailing semicolon is an empty statement that does no operation.
Removing it since it doesn't do anything.

Signed-off-by: Luis de Bethencourt <luisbg@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 0188cff98cdd..35636e1d8a3d 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -1211,7 +1211,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 	struct rapl_domain *rd;
 	char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/
 	struct powercap_zone *power_zone = NULL;
-	int nr_pl, ret;;
+	int nr_pl, ret;
 
 	/* Update the domain data of the new package */
 	rapl_update_domain_data(rp);

From dff4113d5e3753c23e8b5bb6818f5829ccd0d06a Mon Sep 17 00:00:00 2001
From: Sudeep Holla <Sudeep.Holla@arm.com>
Date: Wed, 10 Jan 2018 16:44:14 +0000
Subject: [PATCH 87/89] drivers: psci: remove cluster terminology and
 dependency on physical_package_id

Since the definition of the term "cluster" is not well defined in the
architecture, we should avoid using it. Also the physical package id
is currently mapped to so called "clusters" in ARM/ARM64 platforms which
is already argumentative.

Currently PSCI checker uses the physical package id assuming that CPU
power domains map to "clusters" and the physical package id in the code
as it stands also maps to cluster boundaries. It does that trying to
test "cluster" idle states to its best. However the CPU power domain
often but not always maps directly to the processor topology.

This patch removes the dependency on physical_package_id from the topology
in this PSCI checker. Also it replaces all the occurences of clusters to
cpu_groups which is derived from core_sibling_mask and may not directly
map to physical "cluster".

Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/firmware/psci_checker.c | 46 ++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/drivers/firmware/psci_checker.c b/drivers/firmware/psci_checker.c
index f3f4f810e5df..bb1c068bff19 100644
--- a/drivers/firmware/psci_checker.c
+++ b/drivers/firmware/psci_checker.c
@@ -77,8 +77,8 @@ static int psci_ops_check(void)
 	return 0;
 }
 
-static int find_clusters(const struct cpumask *cpus,
-			 const struct cpumask **clusters)
+static int find_cpu_groups(const struct cpumask *cpus,
+			   const struct cpumask **cpu_groups)
 {
 	unsigned int nb = 0;
 	cpumask_var_t tmp;
@@ -88,11 +88,11 @@ static int find_clusters(const struct cpumask *cpus,
 	cpumask_copy(tmp, cpus);
 
 	while (!cpumask_empty(tmp)) {
-		const struct cpumask *cluster =
+		const struct cpumask *cpu_group =
 			topology_core_cpumask(cpumask_any(tmp));
 
-		clusters[nb++] = cluster;
-		cpumask_andnot(tmp, tmp, cluster);
+		cpu_groups[nb++] = cpu_group;
+		cpumask_andnot(tmp, tmp, cpu_group);
 	}
 
 	free_cpumask_var(tmp);
@@ -170,24 +170,24 @@ static int hotplug_tests(void)
 {
 	int err;
 	cpumask_var_t offlined_cpus;
-	int i, nb_cluster;
-	const struct cpumask **clusters;
+	int i, nb_cpu_group;
+	const struct cpumask **cpu_groups;
 	char *page_buf;
 
 	err = -ENOMEM;
 	if (!alloc_cpumask_var(&offlined_cpus, GFP_KERNEL))
 		return err;
-	/* We may have up to nb_available_cpus clusters. */
-	clusters = kmalloc_array(nb_available_cpus, sizeof(*clusters),
-				 GFP_KERNEL);
-	if (!clusters)
+	/* We may have up to nb_available_cpus cpu_groups. */
+	cpu_groups = kmalloc_array(nb_available_cpus, sizeof(*cpu_groups),
+				   GFP_KERNEL);
+	if (!cpu_groups)
 		goto out_free_cpus;
 	page_buf = (char *)__get_free_page(GFP_KERNEL);
 	if (!page_buf)
-		goto out_free_clusters;
+		goto out_free_cpu_groups;
 
 	err = 0;
-	nb_cluster = find_clusters(cpu_online_mask, clusters);
+	nb_cpu_group = find_cpu_groups(cpu_online_mask, cpu_groups);
 
 	/*
 	 * Of course the last CPU cannot be powered down and cpu_down() should
@@ -197,24 +197,22 @@ static int hotplug_tests(void)
 	err += down_and_up_cpus(cpu_online_mask, offlined_cpus);
 
 	/*
-	 * Take down CPUs by cluster this time. When the last CPU is turned
-	 * off, the cluster itself should shut down.
+	 * Take down CPUs by cpu group this time. When the last CPU is turned
+	 * off, the cpu group itself should shut down.
 	 */
-	for (i = 0; i < nb_cluster; ++i) {
-		int cluster_id =
-			topology_physical_package_id(cpumask_any(clusters[i]));
+	for (i = 0; i < nb_cpu_group; ++i) {
 		ssize_t len = cpumap_print_to_pagebuf(true, page_buf,
-						      clusters[i]);
+						      cpu_groups[i]);
 		/* Remove trailing newline. */
 		page_buf[len - 1] = '\0';
-		pr_info("Trying to turn off and on again cluster %d "
-			"(CPUs %s)\n", cluster_id, page_buf);
-		err += down_and_up_cpus(clusters[i], offlined_cpus);
+		pr_info("Trying to turn off and on again group %d (CPUs %s)\n",
+			i, page_buf);
+		err += down_and_up_cpus(cpu_groups[i], offlined_cpus);
 	}
 
 	free_page((unsigned long)page_buf);
-out_free_clusters:
-	kfree(clusters);
+out_free_cpu_groups:
+	kfree(cpu_groups);
 out_free_cpus:
 	free_cpumask_var(offlined_cpus);
 	return err;

From 343a8d17fa8d6dd97f408e8fedbcef12073f3774 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <Sudeep.Holla@arm.com>
Date: Wed, 10 Jan 2018 16:44:15 +0000
Subject: [PATCH 88/89] cpufreq: scpi: remove arm_big_little dependency

The dependency on physical_package_id from the topology to get the
cluster identifier is wrong. The concept of cluster used in ARM topology
is unfortunately not well defined in the architecture, we should avoid
using it. Further the frequency domain need not be mapped to so called
"clusters" one to one.

SCPI already provides means to obtain the frequency domain id from the
device tree. In order to support some new topologies(e.g. DSU which
contains 2 frequency domains within the physical cluster), pseudo
clusters are created to make this driver work which is wrong again.

In order to solve those issues and also remove dependency of topological
physical id for frequency domain, this patch removes the arm_big_little
dependency from scpi driver.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/scpi-cpufreq.c | 193 ++++++++++++++++++++++++++++++---
 1 file changed, 178 insertions(+), 15 deletions(-)

diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c
index 05d299052c5c..247fcbfa4cb5 100644
--- a/drivers/cpufreq/scpi-cpufreq.c
+++ b/drivers/cpufreq/scpi-cpufreq.c
@@ -18,27 +18,89 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/clk.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
+#include <linux/cpumask.h>
+#include <linux/cpu_cooling.h>
+#include <linux/export.h>
 #include <linux/module.h>
-#include <linux/platform_device.h>
+#include <linux/of_platform.h>
 #include <linux/pm_opp.h>
 #include <linux/scpi_protocol.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 
-#include "arm_big_little.h"
+struct scpi_data {
+	struct clk *clk;
+	struct device *cpu_dev;
+	struct thermal_cooling_device *cdev;
+};
 
 static struct scpi_ops *scpi_ops;
 
-static int scpi_get_transition_latency(struct device *cpu_dev)
+static unsigned int scpi_cpufreq_get_rate(unsigned int cpu)
 {
-	return scpi_ops->get_transition_latency(cpu_dev);
+	struct cpufreq_policy *policy = cpufreq_cpu_get_raw(cpu);
+	struct scpi_data *priv = policy->driver_data;
+	unsigned long rate = clk_get_rate(priv->clk);
+
+	return rate / 1000;
 }
 
-static int scpi_init_opp_table(const struct cpumask *cpumask)
+static int
+scpi_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index)
+{
+	struct scpi_data *priv = policy->driver_data;
+	u64 rate = policy->freq_table[index].frequency * 1000;
+	int ret;
+
+	ret = clk_set_rate(priv->clk, rate);
+	if (!ret && (clk_get_rate(priv->clk) != rate))
+		ret = -EIO;
+
+	return ret;
+}
+
+static int
+scpi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
+{
+	int cpu, domain, tdomain;
+	struct device *tcpu_dev;
+
+	domain = scpi_ops->device_domain_id(cpu_dev);
+	if (domain < 0)
+		return domain;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu == cpu_dev->id)
+			continue;
+
+		tcpu_dev = get_cpu_device(cpu);
+		if (!tcpu_dev)
+			continue;
+
+		tdomain = scpi_ops->device_domain_id(tcpu_dev);
+		if (tdomain == domain)
+			cpumask_set_cpu(cpu, cpumask);
+	}
+
+	return 0;
+}
+
+static int scpi_cpufreq_init(struct cpufreq_policy *policy)
 {
 	int ret;
-	struct device *cpu_dev = get_cpu_device(cpumask_first(cpumask));
+	unsigned int latency;
+	struct device *cpu_dev;
+	struct scpi_data *priv;
+	struct cpufreq_frequency_table *freq_table;
+
+	cpu_dev = get_cpu_device(policy->cpu);
+	if (!cpu_dev) {
+		pr_err("failed to get cpu%d device\n", policy->cpu);
+		return -ENODEV;
+	}
 
 	ret = scpi_ops->add_opps_to_device(cpu_dev);
 	if (ret) {
@@ -46,32 +108,133 @@ static int scpi_init_opp_table(const struct cpumask *cpumask)
 		return ret;
 	}
 
-	ret = dev_pm_opp_set_sharing_cpus(cpu_dev, cpumask);
-	if (ret)
+	ret = scpi_get_sharing_cpus(cpu_dev, policy->cpus);
+	if (ret) {
+		dev_warn(cpu_dev, "failed to get sharing cpumask\n");
+		return ret;
+	}
+
+	ret = dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus);
+	if (ret) {
 		dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
 			__func__, ret);
+		return ret;
+	}
+
+	ret = dev_pm_opp_get_opp_count(cpu_dev);
+	if (ret <= 0) {
+		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
+		ret = -EPROBE_DEFER;
+		goto out_free_opp;
+	}
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		goto out_free_opp;
+	}
+
+	ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
+	if (ret) {
+		dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret);
+		goto out_free_priv;
+	}
+
+	priv->cpu_dev = cpu_dev;
+	priv->clk = clk_get(cpu_dev, NULL);
+	if (IS_ERR(priv->clk)) {
+		dev_err(cpu_dev, "%s: Failed to get clk for cpu: %d\n",
+			__func__, cpu_dev->id);
+		goto out_free_cpufreq_table;
+	}
+
+	policy->driver_data = priv;
+
+	ret = cpufreq_table_validate_and_show(policy, freq_table);
+	if (ret) {
+		dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__,
+			ret);
+		goto out_put_clk;
+	}
+
+	/* scpi allows DVFS request for any domain from any CPU */
+	policy->dvfs_possible_from_any_cpu = true;
+
+	latency = scpi_ops->get_transition_latency(cpu_dev);
+	if (!latency)
+		latency = CPUFREQ_ETERNAL;
+
+	policy->cpuinfo.transition_latency = latency;
+
+	policy->fast_switch_possible = false;
+	return 0;
+
+out_put_clk:
+	clk_put(priv->clk);
+out_free_cpufreq_table:
+	dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
+out_free_priv:
+	kfree(priv);
+out_free_opp:
+	dev_pm_opp_cpumask_remove_table(policy->cpus);
+
 	return ret;
 }
 
-static const struct cpufreq_arm_bL_ops scpi_cpufreq_ops = {
-	.name	= "scpi",
-	.get_transition_latency = scpi_get_transition_latency,
-	.init_opp_table = scpi_init_opp_table,
-	.free_opp_table = dev_pm_opp_cpumask_remove_table,
+static int scpi_cpufreq_exit(struct cpufreq_policy *policy)
+{
+	struct scpi_data *priv = policy->driver_data;
+
+	cpufreq_cooling_unregister(priv->cdev);
+	clk_put(priv->clk);
+	dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
+	kfree(priv);
+	dev_pm_opp_cpumask_remove_table(policy->related_cpus);
+
+	return 0;
+}
+
+static void scpi_cpufreq_ready(struct cpufreq_policy *policy)
+{
+	struct scpi_data *priv = policy->driver_data;
+	struct thermal_cooling_device *cdev;
+
+	cdev = of_cpufreq_cooling_register(policy);
+	if (!IS_ERR(cdev))
+		priv->cdev = cdev;
+}
+
+static struct cpufreq_driver scpi_cpufreq_driver = {
+	.name	= "scpi-cpufreq",
+	.flags	= CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
+		  CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.verify	= cpufreq_generic_frequency_table_verify,
+	.attr	= cpufreq_generic_attr,
+	.get	= scpi_cpufreq_get_rate,
+	.init	= scpi_cpufreq_init,
+	.exit	= scpi_cpufreq_exit,
+	.ready	= scpi_cpufreq_ready,
+	.target_index	= scpi_cpufreq_set_target,
 };
 
 static int scpi_cpufreq_probe(struct platform_device *pdev)
 {
+	int ret;
+
 	scpi_ops = get_scpi_ops();
 	if (!scpi_ops)
 		return -EIO;
 
-	return bL_cpufreq_register(&scpi_cpufreq_ops);
+	ret = cpufreq_register_driver(&scpi_cpufreq_driver);
+	if (ret)
+		dev_err(&pdev->dev, "%s: registering cpufreq failed, err: %d\n",
+			__func__, ret);
+	return ret;
 }
 
 static int scpi_cpufreq_remove(struct platform_device *pdev)
 {
-	bL_cpufreq_unregister(&scpi_cpufreq_ops);
+	cpufreq_unregister_driver(&scpi_cpufreq_driver);
 	scpi_ops = NULL;
 	return 0;
 }

From bee344cb70e9bf5ad929e0a493c0f7aa3a587bfb Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@kernel.org>
Date: Wed, 17 Jan 2018 10:33:21 +0000
Subject: [PATCH 89/89] PCI / PM: Remove spurious semicolon

The trailing semicolon is an empty statement that does no operation.
Removing it since it doesn't do anything.

Signed-off-by: Luis de Bethencourt <luisbg@kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pci/pci-driver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 14fd865a5120..765890e77cd5 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -953,7 +953,7 @@ static int pci_pm_freeze_late(struct device *dev)
 	if (dev_pm_smart_suspend_and_suspended(dev))
 		return 0;
 
-	return pm_generic_freeze_late(dev);;
+	return pm_generic_freeze_late(dev);
 }
 
 static int pci_pm_freeze_noirq(struct device *dev)