From 234f223d63d8f7db64a682ccf02871d40d38db52 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 7 Jun 2019 00:30:58 +0200
Subject: [PATCH 1/8] PCI: PM: Avoid resuming devices in D3hot during system
 suspend

The current code resumes devices in D3hot during system suspend if
the target power state for them is D3cold, but that is not necessary
in general.  It only is necessary to do that if the platform firmware
requires the device to be resumed, but that should be covered by
the platform_pci_need_resume() check anyway, so rework
pci_dev_keep_suspended() to avoid returning 'false' for devices
in D3hot which need not be resumed due to platform firmware
requirements.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/pci/pci.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8abc843b1615..324175509e94 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2474,10 +2474,20 @@ bool pci_dev_keep_suspended(struct pci_dev *pci_dev)
 {
 	struct device *dev = &pci_dev->dev;
 	bool wakeup = device_may_wakeup(dev);
+	pci_power_t target_state;
 
-	if (!pm_runtime_suspended(dev)
-	    || pci_target_state(pci_dev, wakeup) != pci_dev->current_state
-	    || platform_pci_need_resume(pci_dev))
+	if (!pm_runtime_suspended(dev) || platform_pci_need_resume(pci_dev))
+		return false;
+
+	target_state = pci_target_state(pci_dev, wakeup);
+
+	/*
+	 * If the earlier platform check has not triggered, D3cold is just power
+	 * removal on top of D3hot, so no need to resume the device in that
+	 * case.
+	 */
+	if (target_state != pci_dev->current_state &&
+	    target_state != PCI_D3cold && pci_dev->current_state != PCI_D3hot)
 		return false;
 
 	/*

From 0c7376ada9508141becec9b897d73b65ce66a15a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 7 Jun 2019 00:32:31 +0200
Subject: [PATCH 2/8] PCI: PM: Replace pci_dev_keep_suspended() with two
 functions

The code in pci_dev_keep_suspended() is relatively hard to follow due
to the negative checks in it and in its callers and the function has
a possible side-effect (disabling the PME) which doesn't really match
its role.

For this reason, move the PME disabling from pci_dev_keep_suspended()
to a separate function and change the semantics (and name) of the
rest of it, so that 'true' is returned when the device needs to be
resumed (and not the other way around).  Change the callers of
pci_dev_keep_suspended() accordingly.

While at it, make the code flow in pci_pm_poweroff() reflect the
pci_pm_suspend() more closely to avoid arbitrary differences between
them.

This is a cosmetic change with no intention to alter behavior.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/pci/pci-driver.c | 22 +++++++++++++---
 drivers/pci/pci.c        | 55 ++++++++++++++++++++--------------------
 drivers/pci/pci.h        |  3 ++-
 3 files changed, 48 insertions(+), 32 deletions(-)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 5eadbc3d0969..dd656cf19a12 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -679,6 +679,7 @@ static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev)
 static int pci_pm_prepare(struct device *dev)
 {
 	struct device_driver *drv = dev->driver;
+	struct pci_dev *pci_dev = to_pci_dev(dev);
 
 	if (drv && drv->pm && drv->pm->prepare) {
 		int error = drv->pm->prepare(dev);
@@ -688,7 +689,15 @@ static int pci_pm_prepare(struct device *dev)
 		if (!error && dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_PREPARE))
 			return 0;
 	}
-	return pci_dev_keep_suspended(to_pci_dev(dev));
+	if (pci_dev_need_resume(pci_dev))
+		return 0;
+
+	/*
+	 * The PME setting needs to be adjusted here in case the direct-complete
+	 * optimization is used with respect to this device.
+	 */
+	pci_dev_adjust_pme(pci_dev);
+	return 1;
 }
 
 static void pci_pm_complete(struct device *dev)
@@ -758,9 +767,11 @@ static int pci_pm_suspend(struct device *dev)
 	 * better to resume the device from runtime suspend here.
 	 */
 	if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) ||
-	    !pci_dev_keep_suspended(pci_dev)) {
+	    pci_dev_need_resume(pci_dev)) {
 		pm_runtime_resume(dev);
 		pci_dev->state_saved = false;
+	} else {
+		pci_dev_adjust_pme(pci_dev);
 	}
 
 	if (pm->suspend) {
@@ -1108,10 +1119,13 @@ static int pci_pm_poweroff(struct device *dev)
 
 	/* The reason to do that is the same as in pci_pm_suspend(). */
 	if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) ||
-	    !pci_dev_keep_suspended(pci_dev))
+	    pci_dev_need_resume(pci_dev)) {
 		pm_runtime_resume(dev);
+		pci_dev->state_saved = false;
+	} else {
+		pci_dev_adjust_pme(pci_dev);
+	}
 
-	pci_dev->state_saved = false;
 	if (pm->poweroff) {
 		int error;
 
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 324175509e94..24d86bd1bba1 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2459,55 +2459,56 @@ bool pci_dev_run_wake(struct pci_dev *dev)
 EXPORT_SYMBOL_GPL(pci_dev_run_wake);
 
 /**
- * pci_dev_keep_suspended - Check if the device can stay in the suspended state.
+ * pci_dev_need_resume - Check if it is necessary to resume the device.
  * @pci_dev: Device to check.
  *
- * Return 'true' if the device is runtime-suspended, it doesn't have to be
+ * Return 'true' if the device is not runtime-suspended or it has to be
  * reconfigured due to wakeup settings difference between system and runtime
- * suspend and the current power state of it is suitable for the upcoming
- * (system) transition.
- *
- * If the device is not configured for system wakeup, disable PME for it before
- * returning 'true' to prevent it from waking up the system unnecessarily.
+ * suspend, or the current power state of it is not suitable for the upcoming
+ * (system-wide) transition.
  */
-bool pci_dev_keep_suspended(struct pci_dev *pci_dev)
+bool pci_dev_need_resume(struct pci_dev *pci_dev)
 {
 	struct device *dev = &pci_dev->dev;
-	bool wakeup = device_may_wakeup(dev);
 	pci_power_t target_state;
 
 	if (!pm_runtime_suspended(dev) || platform_pci_need_resume(pci_dev))
-		return false;
+		return true;
 
-	target_state = pci_target_state(pci_dev, wakeup);
+	target_state = pci_target_state(pci_dev, device_may_wakeup(dev));
 
 	/*
 	 * If the earlier platform check has not triggered, D3cold is just power
 	 * removal on top of D3hot, so no need to resume the device in that
 	 * case.
 	 */
-	if (target_state != pci_dev->current_state &&
-	    target_state != PCI_D3cold && pci_dev->current_state != PCI_D3hot)
-		return false;
+	return target_state != pci_dev->current_state &&
+		target_state != PCI_D3cold &&
+		pci_dev->current_state != PCI_D3hot;
+}
+
+/**
+ * pci_dev_adjust_pme - Adjust PME setting for a suspended device.
+ * @pci_dev: Device to check.
+ *
+ * If the device is suspended and it is not configured for system wakeup,
+ * disable PME for it to prevent it from waking up the system unnecessarily.
+ *
+ * Note that if the device's power state is D3cold and the platform check in
+ * pci_dev_need_resume() has not triggered, the device's configuration need not
+ * be changed.
+ */
+void pci_dev_adjust_pme(struct pci_dev *pci_dev)
+{
+	struct device *dev = &pci_dev->dev;
 
-	/*
-	 * At this point the device is good to go unless it's been configured
-	 * to generate PME at the runtime suspend time, but it is not supposed
-	 * to wake up the system.  In that case, simply disable PME for it
-	 * (it will have to be re-enabled on exit from system resume).
-	 *
-	 * If the device's power state is D3cold and the platform check above
-	 * hasn't triggered, the device's configuration is suitable and we don't
-	 * need to manipulate it at all.
-	 */
 	spin_lock_irq(&dev->power.lock);
 
-	if (pm_runtime_suspended(dev) && pci_dev->current_state < PCI_D3cold &&
-	    !wakeup)
+	if (pm_runtime_suspended(dev) && !device_may_wakeup(dev) &&
+	    pci_dev->current_state < PCI_D3cold)
 		__pci_pme_active(pci_dev, false);
 
 	spin_unlock_irq(&dev->power.lock);
-	return true;
 }
 
 /**
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9cb99380c61e..e04fa7fd3e2f 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -82,7 +82,8 @@ int pci_finish_runtime_suspend(struct pci_dev *dev);
 void pcie_clear_root_pme_status(struct pci_dev *dev);
 int __pci_pme_wakeup(struct pci_dev *dev, void *ign);
 void pci_pme_restore(struct pci_dev *dev);
-bool pci_dev_keep_suspended(struct pci_dev *dev);
+bool pci_dev_need_resume(struct pci_dev *dev);
+void pci_dev_adjust_pme(struct pci_dev *dev);
 void pci_dev_complete_resume(struct pci_dev *pci_dev);
 void pci_config_pm_runtime_get(struct pci_dev *dev);
 void pci_config_pm_runtime_put(struct pci_dev *dev);

From c2bf1fc212f7e6f25ace1af8f0b3ac061ea48ba5 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Wed, 12 Jun 2019 13:57:38 +0300
Subject: [PATCH 3/8] PCI: Add missing link delays required by the PCIe spec

Currently Linux does not follow PCIe spec regarding the required delays
after reset. A concrete example is a Thunderbolt add-in-card that
consists of a PCIe switch and two PCIe endpoints:

  +-1b.0-[01-6b]----00.0-[02-6b]--+-00.0-[03]----00.0 TBT controller
                                  +-01.0-[04-36]-- DS hotplug port
                                  +-02.0-[37]----00.0 xHCI controller
                                  \-04.0-[38-6b]-- DS hotplug port

The root port (1b.0) and the PCIe switch downstream ports are all PCIe
gen3 so they support 8GT/s link speeds.

We wait for the PCIe hierarchy to enter D3cold (runtime):

  pcieport 0000:00:1b.0: power state changed by ACPI to D3cold

When it wakes up from D3cold, according to the PCIe 4.0 section 5.8 the
PCIe switch is put to reset and its power is re-applied. This means that
we must follow the rules in PCIe 4.0 section 6.6.1.

For the PCIe gen3 ports we are dealing with here, the following applies:

  With a Downstream Port that supports Link speeds greater than 5.0
  GT/s, software must wait a minimum of 100 ms after Link training
  completes before sending a Configuration Request to the device
  immediately below that Port. Software can determine when Link training
  completes by polling the Data Link Layer Link Active bit or by setting
  up an associated interrupt (see Section 6.7.3.3).

Translating this into the above topology we would need to do this (DLLLA
stands for Data Link Layer Link Active):

  pcieport 0000:00:1b.0: wait for 100ms after DLLLA is set before access to 0000:01:00.0
  pcieport 0000:02:00.0: wait for 100ms after DLLLA is set before access to 0000:03:00.0
  pcieport 0000:02:02.0: wait for 100ms after DLLLA is set before access to 0000:37:00.0

I've instrumented the kernel with additional logging so we can see the
actual delays the kernel performs:

  pcieport 0000:00:1b.0: power state changed by ACPI to D0
  pcieport 0000:00:1b.0: waiting for D3cold delay of 100 ms
  pcieport 0000:00:1b.0: waking up bus
  pcieport 0000:00:1b.0: waiting for D3hot delay of 10 ms
  pcieport 0000:00:1b.0: restoring config space at offset 0x2c (was 0x60, writing 0x60)
  ...
  pcieport 0000:00:1b.0: PME# disabled
  pcieport 0000:01:00.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  ...
  pcieport 0000:01:00.0: PME# disabled
  pcieport 0000:02:00.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  ...
  pcieport 0000:02:00.0: PME# disabled
  pcieport 0000:02:01.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  ...
  pcieport 0000:02:01.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407)
  pcieport 0000:02:01.0: PME# disabled
  pcieport 0000:02:02.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  ...
  pcieport 0000:02:02.0: PME# disabled
  pcieport 0000:02:04.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  ...
  pcieport 0000:02:04.0: PME# disabled
  pcieport 0000:02:01.0: PME# enabled
  pcieport 0000:02:01.0: waiting for D3hot delay of 10 ms
  pcieport 0000:02:04.0: PME# enabled
  pcieport 0000:02:04.0: waiting for D3hot delay of 10 ms
  thunderbolt 0000:03:00.0: restoring config space at offset 0x14 (was 0x0, writing 0x8a040000)
  ...
  thunderbolt 0000:03:00.0: PME# disabled
  xhci_hcd 0000:37:00.0: restoring config space at offset 0x10 (was 0x0, writing 0x73f00000)
  ...
  xhci_hcd 0000:37:00.0: PME# disabled

For the switch upstream port (01:00.0) we wait for 100ms but not taking
into account the DLLLA requirement. We then wait 10ms for D3hot -> D0
transition of the root port and the two downstream hotplug ports. This
means that we deviate from what the spec requires.

Performing the same check for system sleep (s2idle) transitions we can
see following when resuming from s2idle:

  pcieport 0000:00:1b.0: power state changed by ACPI to D0
  pcieport 0000:00:1b.0: restoring config space at offset 0x2c (was 0x60, writing 0x60)
  ...
  pcieport 0000:01:00.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  ...
  pcieport 0000:02:02.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  pcieport 0000:02:02.0: restoring config space at offset 0x2c (was 0x0, writing 0x0)
  pcieport 0000:02:01.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  pcieport 0000:02:04.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  pcieport 0000:02:02.0: restoring config space at offset 0x28 (was 0x0, writing 0x0)
  pcieport 0000:02:00.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff)
  pcieport 0000:02:02.0: restoring config space at offset 0x24 (was 0x10001, writing 0x1fff1)
  pcieport 0000:02:01.0: restoring config space at offset 0x2c (was 0x0, writing 0x60)
  pcieport 0000:02:02.0: restoring config space at offset 0x20 (was 0x0, writing 0x73f073f0)
  pcieport 0000:02:04.0: restoring config space at offset 0x2c (was 0x0, writing 0x60)
  pcieport 0000:02:01.0: restoring config space at offset 0x28 (was 0x0, writing 0x60)
  pcieport 0000:02:00.0: restoring config space at offset 0x2c (was 0x0, writing 0x0)
  pcieport 0000:02:02.0: restoring config space at offset 0x1c (was 0x101, writing 0x1f1)
  pcieport 0000:02:04.0: restoring config space at offset 0x28 (was 0x0, writing 0x60)
  pcieport 0000:02:01.0: restoring config space at offset 0x24 (was 0x10001, writing 0x1ff10001)
  pcieport 0000:02:00.0: restoring config space at offset 0x28 (was 0x0, writing 0x0)
  pcieport 0000:02:02.0: restoring config space at offset 0x18 (was 0x0, writing 0x373702)
  pcieport 0000:02:04.0: restoring config space at offset 0x24 (was 0x10001, writing 0x49f12001)
  pcieport 0000:02:01.0: restoring config space at offset 0x20 (was 0x0, writing 0x73e05c00)
  pcieport 0000:02:00.0: restoring config space at offset 0x24 (was 0x10001, writing 0x1fff1)
  pcieport 0000:02:04.0: restoring config space at offset 0x20 (was 0x0, writing 0x89f07400)
  pcieport 0000:02:01.0: restoring config space at offset 0x1c (was 0x101, writing 0x5151)
  pcieport 0000:02:00.0: restoring config space at offset 0x20 (was 0x0, writing 0x8a008a00)
  pcieport 0000:02:02.0: restoring config space at offset 0xc (was 0x10000, writing 0x10020)
  pcieport 0000:02:04.0: restoring config space at offset 0x1c (was 0x101, writing 0x6161)
  pcieport 0000:02:01.0: restoring config space at offset 0x18 (was 0x0, writing 0x360402)
  pcieport 0000:02:00.0: restoring config space at offset 0x1c (was 0x101, writing 0x1f1)
  pcieport 0000:02:04.0: restoring config space at offset 0x18 (was 0x0, writing 0x6b3802)
  pcieport 0000:02:02.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407)
  pcieport 0000:02:00.0: restoring config space at offset 0x18 (was 0x0, writing 0x30302)
  pcieport 0000:02:01.0: restoring config space at offset 0xc (was 0x10000, writing 0x10020)
  pcieport 0000:02:04.0: restoring config space at offset 0xc (was 0x10000, writing 0x10020)
  pcieport 0000:02:00.0: restoring config space at offset 0xc (was 0x10000, writing 0x10020)
  pcieport 0000:02:01.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407)
  pcieport 0000:02:04.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407)
  pcieport 0000:02:00.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407)
  xhci_hcd 0000:37:00.0: restoring config space at offset 0x10 (was 0x0, writing 0x73f00000)
  ...
  thunderbolt 0000:03:00.0: restoring config space at offset 0x14 (was 0x0, writing 0x8a040000)

This is even worse. None of the mandatory delays are performed. If this
would be S3 instead of s2idle then according to PCI FW spec 3.2 section
4.6.8.  there is a specific _DSM that allows the OS to skip the delays
but this platform does not provide the _DSM and does not go to S3 anyway
so no firmware is involved that could already handle these delays.

In this particular Intel Coffee Lake platform these delays are not
actually needed because there is an additional delay as part of the ACPI
power resource that is used to turn on power to the hierarchy but since
that additional delay is not required by any of standards (PCIe, ACPI)
it is not present in the Intel Ice Lake, for example where missing the
mandatory delays causes pciehp to start tearing down the stack too early
(links are not yet trained).

For this reason, change the PCIe portdrv PM resume hooks so that they
perform the mandatory delays before the downstream component gets
resumed. We perform the delays before port services are resumed because
otherwise pciehp might find that the link is not up (even if it is just
training) and tears-down the hierarchy.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pci/pci.c               | 29 ++++++++++-----
 drivers/pci/pci.h               |  1 +
 drivers/pci/pcie/portdrv_core.c | 66 +++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 24d86bd1bba1..9839d6f9bcb5 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1004,15 +1004,10 @@ static void __pci_start_power_transition(struct pci_dev *dev, pci_power_t state)
 	if (state == PCI_D0) {
 		pci_platform_power_transition(dev, PCI_D0);
 		/*
-		 * Mandatory power management transition delays, see
-		 * PCI Express Base Specification Revision 2.0 Section
-		 * 6.6.1: Conventional Reset.  Do not delay for
-		 * devices powered on/off by corresponding bridge,
-		 * because have already delayed for the bridge.
+		 * Mandatory power management transition delays are
+		 * handled in the PCIe portdrv resume hooks.
 		 */
 		if (dev->runtime_d3cold) {
-			if (dev->d3cold_delay && !dev->imm_ready)
-				msleep(dev->d3cold_delay);
 			/*
 			 * When powering on a bridge from D3cold, the
 			 * whole hierarchy may be powered on into
@@ -4579,14 +4574,16 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
 
 	return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
 }
+
 /**
- * pcie_wait_for_link - Wait until link is active or inactive
+ * pcie_wait_for_link_delay - Wait until link is active or inactive
  * @pdev: Bridge device
  * @active: waiting for active or inactive?
+ * @delay: Delay to wait after link has become active (in ms)
  *
  * Use this to wait till link becomes active or inactive.
  */
-bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active, int delay)
 {
 	int timeout = 1000;
 	bool ret;
@@ -4623,13 +4620,25 @@ bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
 		timeout -= 10;
 	}
 	if (active && ret)
-		msleep(100);
+		msleep(delay);
 	else if (ret != active)
 		pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
 			active ? "set" : "cleared");
 	return ret == active;
 }
 
+/**
+ * pcie_wait_for_link - Wait until link is active or inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+	return pcie_wait_for_link_delay(pdev, active, 100);
+}
+
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
 	u16 ctrl;
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index e04fa7fd3e2f..7f9fd93270ed 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -494,6 +494,7 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev)
 void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
 		      u32 service);
 
+bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active, int delay);
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 1b330129089f..308c3e0c4a34 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/kernel.h>
+#include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/pm.h>
 #include <linux/pm_runtime.h>
@@ -378,6 +379,67 @@ static int pm_iter(struct device *dev, void *data)
 	return 0;
 }
 
+static int get_downstream_delay(struct pci_bus *bus)
+{
+	struct pci_dev *pdev;
+	int min_delay = 100;
+	int max_delay = 0;
+
+	list_for_each_entry(pdev, &bus->devices, bus_list) {
+		if (!pdev->imm_ready)
+			min_delay = 0;
+		else if (pdev->d3cold_delay < min_delay)
+			min_delay = pdev->d3cold_delay;
+		if (pdev->d3cold_delay > max_delay)
+			max_delay = pdev->d3cold_delay;
+	}
+
+	return max(min_delay, max_delay);
+}
+
+/*
+ * wait_for_downstream_link - Wait for downstream link to establish
+ * @pdev: PCIe port whose downstream link is waited
+ *
+ * Handle delays according to PCIe 4.0 section 6.6.1 before configuration
+ * access to the downstream component is permitted.
+ *
+ * This blocks PCI core resume of the hierarchy below this port until the
+ * link is trained. Should be called before resuming port services to
+ * prevent pciehp from starting to tear-down the hierarchy too soon.
+ */
+static void wait_for_downstream_link(struct pci_dev *pdev)
+{
+	int delay;
+
+	if (pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT &&
+	    pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM)
+		return;
+
+	if (pci_dev_is_disconnected(pdev))
+		return;
+
+	if (!pdev->subordinate || list_empty(&pdev->subordinate->devices) ||
+	    !pdev->bridge_d3)
+		return;
+
+	delay = get_downstream_delay(pdev->subordinate);
+	if (!delay)
+		return;
+
+	dev_dbg(&pdev->dev, "waiting downstream link for %d ms\n", delay);
+
+	/*
+	 * If downstream port does not support speeds greater than 5 GT/s
+	 * need to wait 100ms. For higher speeds (gen3) we need to wait
+	 * first for the data link layer to become active.
+	 */
+	if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT)
+		msleep(delay);
+	else
+		pcie_wait_for_link_delay(pdev, true, delay);
+}
+
 /**
  * pcie_port_device_suspend - suspend port services associated with a PCIe port
  * @dev: PCI Express port to handle
@@ -391,6 +453,8 @@ int pcie_port_device_suspend(struct device *dev)
 int pcie_port_device_resume_noirq(struct device *dev)
 {
 	size_t off = offsetof(struct pcie_port_service_driver, resume_noirq);
+
+	wait_for_downstream_link(to_pci_dev(dev));
 	return device_for_each_child(dev, &off, pm_iter);
 }
 
@@ -421,6 +485,8 @@ int pcie_port_device_runtime_suspend(struct device *dev)
 int pcie_port_device_runtime_resume(struct device *dev)
 {
 	size_t off = offsetof(struct pcie_port_service_driver, runtime_resume);
+
+	wait_for_downstream_link(to_pci_dev(dev));
 	return device_for_each_child(dev, &off, pm_iter);
 }
 #endif /* PM */

From 000dd5316e1c756a1c028f22e01d06a38249dd4d Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Wed, 12 Jun 2019 13:57:39 +0300
Subject: [PATCH 4/8] PCI: Do not poll for PME if the device is in D3cold

PME polling does not take into account that a device that is directly
connected to the host bridge may go into D3cold as well. This leads to a
situation where the PME poll thread reads from a config space of a
device that is in D3cold and gets incorrect information because the
config space is not accessible.

Here is an example from Intel Ice Lake system where two PCIe root ports
are in D3cold (I've instrumented the kernel to log the PMCSR register
contents):

  [   62.971442] pcieport 0000:00:07.1: Check PME status, PMCSR=0xffff
  [   62.971504] pcieport 0000:00:07.0: Check PME status, PMCSR=0xffff

Since 0xffff is interpreted so that PME is pending, the root ports will
be runtime resumed. This repeats over and over again essentially
blocking all runtime power management.

Prevent this from happening by checking whether the device is in D3cold
before its PME status is read.

Fixes: 71a83bd727cc ("PCI/PM: add runtime PM support to PCIe port")
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
Cc: 3.6+ <stable@vger.kernel.org> # v3.6+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pci/pci.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 9839d6f9bcb5..e34fb2b3c466 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2060,6 +2060,13 @@ static void pci_pme_list_scan(struct work_struct *work)
 			 */
 			if (bridge && bridge->current_state != PCI_D0)
 				continue;
+			/*
+			 * If the device is in D3cold it should not be
+			 * polled either.
+			 */
+			if (pme_dev->dev->current_state == PCI_D3cold)
+				continue;
+
 			pci_pme_wakeup(pme_dev->dev, NULL);
 		} else {
 			list_del(&pme_dev->list);

From 83a16e3f6d70da99896c7a2639c0b60fff13afb8 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 25 Jun 2019 13:29:40 +0300
Subject: [PATCH 5/8] PCI / ACPI: Use cached ACPI device state to get PCI
 device power state

The ACPI power state returned by acpi_device_get_power() may depend on
the configuration of ACPI power resources in the system which may change
any time after acpi_device_get_power() has returned, unless the
reference counters of the ACPI power resources in question are set to
prevent that from happening. Thus it is invalid to use acpi_device_get_power()
in acpi_pci_get_power_state() the way it is done now and the value of
the ->power.state field in the corresponding struct acpi_device objects
(which reflects the ACPI power resources reference counting, among other
things) should be used instead.

As an example where this becomes an issue is Intel Ice Lake where the
Thunderbolt controller (NHI), two PCIe root ports (RP0 and RP1) and xHCI
all share the same power resources. The following picture with power
resources marked with [] shows the topology:

  Host bridge
    |
    +- RP0 ---\
    +- RP1 ---|--+--> [TBT]
    +- NHI --/   |
    |            |
    |            v
    +- xHCI --> [D3C]

Here TBT and D3C are the shared ACPI power resources. ACPI _PR3() method
of the devices in question returns either TBT or D3C or both.

Say we runtime suspend first the root ports RP0 and RP1, then NHI. Now
since the TBT power resource is still on when the root ports are runtime
suspended their dev->current_state is set to D3hot. When NHI is runtime
suspended TBT is finally turned off but state of the root ports remain
to be D3hot. Now when the xHCI is runtime suspended D3C gets also turned
off. PCI core thus has power states of these devices cached in their
dev->current_state as follows:

  RP0 -> D3hot
  RP1 -> D3hot
  NHI -> D3cold
  xHCI -> D3cold

If the user now runs lspci for instance, the result is all 1's like in
the below output (00:07.0 is the first root port, RP0):

00:07.0 PCI bridge: Intel Corporation Device 8a1d (rev ff) (prog-if ff)
    !!! Unknown header type 7f
    Kernel driver in use: pcieport

In short the hardware state is not in sync with the software state
anymore. The exact same thing happens with the PME polling thread which
ends up bringing the root ports back into D0 after they are runtime
suspended.

For this reason, modify acpi_pci_get_power_state() so that it uses the
ACPI device power state that was cached by the ACPI core. This makes the
PCI device power state match the ACPI device power state regardless of
state of the shared power resources which may still be on at this point.

Link: https://lore.kernel.org/r/20190618161858.77834-2-mika.westerberg@linux.intel.com
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pci/pci-acpi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 1897847ceb0c..b782acac26c5 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -685,7 +685,8 @@ static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
 	if (!adev || !acpi_device_power_manageable(adev))
 		return PCI_UNKNOWN;
 
-	if (acpi_device_get_power(adev, &state) || state == ACPI_STATE_UNKNOWN)
+	state = adev->power.state;
+	if (state == ACPI_STATE_UNKNOWN)
 		return PCI_UNKNOWN;
 
 	return state_conv[state];

From 4533771c1e53b921f66e580135ee64a76986a491 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 25 Jun 2019 13:29:41 +0300
Subject: [PATCH 6/8] ACPI / PM: Introduce concept of a _PR0 dependent device

If there are shared power resources between otherwise unrelated devices
turning them on causes the other devices sharing them to be powered up
as well. In case of PCI devices go into D0uninitialized state meaning
that if they were configured to trigger wake that configuration is lost
at this point.

For this reason introduce a concept of "_PR0 dependent device" that can
be added to any ACPI device that has power resources. The dependent
device will be included in a list of dependent devices for all power
resources returned by the ACPI device's _PR0 (assuming it has one).
Whenever a power resource having dependent devices is turned physically
on (its _ON method is called) we runtime resume all of them to allow
their driver or in case of PCI the PCI core to re-initialize the device
and its wake configuration.

This adds two functions that can be used to add and remove these
dependent devices. Note the dependent device does not necessary need
share power resources so this functionality can be used to add "software
dependencies" as well if needed.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/power.c    | 135 ++++++++++++++++++++++++++++++++++++++++
 include/acpi/acpi_bus.h |   4 ++
 2 files changed, 139 insertions(+)

diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c
index a916417b9e70..fe1e7bc91a5e 100644
--- a/drivers/acpi/power.c
+++ b/drivers/acpi/power.c
@@ -42,6 +42,11 @@ ACPI_MODULE_NAME("power");
 #define ACPI_POWER_RESOURCE_STATE_ON	0x01
 #define ACPI_POWER_RESOURCE_STATE_UNKNOWN 0xFF
 
+struct acpi_power_dependent_device {
+	struct device *dev;
+	struct list_head node;
+};
+
 struct acpi_power_resource {
 	struct acpi_device device;
 	struct list_head list_node;
@@ -51,6 +56,7 @@ struct acpi_power_resource {
 	unsigned int ref_count;
 	bool wakeup_enabled;
 	struct mutex resource_lock;
+	struct list_head dependents;
 };
 
 struct acpi_power_resource_entry {
@@ -232,8 +238,121 @@ static int acpi_power_get_list_state(struct list_head *list, int *state)
 	return 0;
 }
 
+static int
+acpi_power_resource_add_dependent(struct acpi_power_resource *resource,
+				  struct device *dev)
+{
+	struct acpi_power_dependent_device *dep;
+	int ret = 0;
+
+	mutex_lock(&resource->resource_lock);
+	list_for_each_entry(dep, &resource->dependents, node) {
+		/* Only add it once */
+		if (dep->dev == dev)
+			goto unlock;
+	}
+
+	dep = kzalloc(sizeof(*dep), GFP_KERNEL);
+	if (!dep) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	dep->dev = dev;
+	list_add_tail(&dep->node, &resource->dependents);
+	dev_dbg(dev, "added power dependency to [%s]\n", resource->name);
+
+unlock:
+	mutex_unlock(&resource->resource_lock);
+	return ret;
+}
+
+static void
+acpi_power_resource_remove_dependent(struct acpi_power_resource *resource,
+				     struct device *dev)
+{
+	struct acpi_power_dependent_device *dep;
+
+	mutex_lock(&resource->resource_lock);
+	list_for_each_entry(dep, &resource->dependents, node) {
+		if (dep->dev == dev) {
+			list_del(&dep->node);
+			kfree(dep);
+			dev_dbg(dev, "removed power dependency to [%s]\n",
+				resource->name);
+			break;
+		}
+	}
+	mutex_unlock(&resource->resource_lock);
+}
+
+/**
+ * acpi_device_power_add_dependent - Add dependent device of this ACPI device
+ * @adev: ACPI device pointer
+ * @dev: Dependent device
+ *
+ * If @adev has non-empty _PR0 the @dev is added as dependent device to all
+ * power resources returned by it. This means that whenever these power
+ * resources are turned _ON the dependent devices get runtime resumed. This
+ * is needed for devices such as PCI to allow its driver to re-initialize
+ * it after it went to D0uninitialized.
+ *
+ * If @adev does not have _PR0 this does nothing.
+ *
+ * Returns %0 in case of success and negative errno otherwise.
+ */
+int acpi_device_power_add_dependent(struct acpi_device *adev,
+				    struct device *dev)
+{
+	struct acpi_power_resource_entry *entry;
+	struct list_head *resources;
+	int ret;
+
+	if (!adev->flags.power_manageable)
+		return 0;
+
+	resources = &adev->power.states[ACPI_STATE_D0].resources;
+	list_for_each_entry(entry, resources, node) {
+		ret = acpi_power_resource_add_dependent(entry->resource, dev);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+
+err:
+	list_for_each_entry(entry, resources, node)
+		acpi_power_resource_remove_dependent(entry->resource, dev);
+
+	return ret;
+}
+
+/**
+ * acpi_device_power_remove_dependent - Remove dependent device
+ * @adev: ACPI device pointer
+ * @dev: Dependent device
+ *
+ * Does the opposite of acpi_device_power_add_dependent() and removes the
+ * dependent device if it is found. Can be called to @adev that does not
+ * have _PR0 as well.
+ */
+void acpi_device_power_remove_dependent(struct acpi_device *adev,
+					struct device *dev)
+{
+	struct acpi_power_resource_entry *entry;
+	struct list_head *resources;
+
+	if (!adev->flags.power_manageable)
+		return;
+
+	resources = &adev->power.states[ACPI_STATE_D0].resources;
+	list_for_each_entry_reverse(entry, resources, node)
+		acpi_power_resource_remove_dependent(entry->resource, dev);
+}
+
 static int __acpi_power_on(struct acpi_power_resource *resource)
 {
+	struct acpi_power_dependent_device *dep;
 	acpi_status status = AE_OK;
 
 	status = acpi_evaluate_object(resource->device.handle, "_ON", NULL, NULL);
@@ -243,6 +362,21 @@ static int __acpi_power_on(struct acpi_power_resource *resource)
 	ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Power resource [%s] turned on\n",
 			  resource->name));
 
+	/*
+	 * If there are other dependents on this power resource we need to
+	 * resume them now so that their drivers can re-initialize the
+	 * hardware properly after it went back to D0.
+	 */
+	if (list_empty(&resource->dependents) ||
+	    list_is_singular(&resource->dependents))
+		return 0;
+
+	list_for_each_entry(dep, &resource->dependents, node) {
+		dev_dbg(dep->dev, "runtime resuming because [%s] turned on\n",
+			resource->name);
+		pm_request_resume(dep->dev);
+	}
+
 	return 0;
 }
 
@@ -810,6 +944,7 @@ int acpi_add_power_resource(acpi_handle handle)
 				ACPI_STA_DEFAULT);
 	mutex_init(&resource->resource_lock);
 	INIT_LIST_HEAD(&resource->list_node);
+	INIT_LIST_HEAD(&resource->dependents);
 	resource->name = device->pnp.bus_id;
 	strcpy(acpi_device_name(device), ACPI_POWER_DEVICE_NAME);
 	strcpy(acpi_device_class(device), ACPI_POWER_CLASS);
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 31b6c87d6240..4752ff0a9d9b 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -513,6 +513,10 @@ int acpi_device_fix_up_power(struct acpi_device *device);
 int acpi_bus_update_power(acpi_handle handle, int *state_p);
 int acpi_device_update_power(struct acpi_device *device, int *state_p);
 bool acpi_bus_power_manageable(acpi_handle handle);
+int acpi_device_power_add_dependent(struct acpi_device *adev,
+				    struct device *dev);
+void acpi_device_power_remove_dependent(struct acpi_device *adev,
+					struct device *dev);
 
 #ifdef CONFIG_PM
 bool acpi_bus_can_wakeup(acpi_handle handle);

From 53b22f900c2d282bf6499712930188cc02306e4e Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 25 Jun 2019 13:29:42 +0300
Subject: [PATCH 7/8] PCI / ACPI: Add _PR0 dependent devices

If otherwise unrelated PCI devices share ACPI power resources turning
them on causes the devices to enter D0uninitialized power state which may
cause problems.

For example in Intel Ice Lake two root ports (RP0 and RP1), Thunderbolt
controller (NHI) and xHCI controller all share power resources as can be
ween in the topology below where power resources are marked with []:

  Host bridge
    |
    +- RP0 ---\
    +- RP1 ---|--+--> [TBT]
    +- NHI --/   |
    |            |
    |            v
    +- xHCI --> [D3C]

In a situation where all devices sharing the power resources are in
D3cold (the power resources are turned off) and for example the
Thunderbolt controller is runtime resumed resulting that the power
resources are turned on. This means that the other devices sharing them
(RP0, RP1 and xHCI) are transitioned into D0uninitialized state. If they
were configured to trigger wake (PME) on a certain event that
configuration gets lost after reset so we would need to re-initialize
them to get the wakeup working as expected again. To do so we would need
to runtime resume all of them to make sure their registers get restored
properly before we can runtime suspend them again.

Since we just added concept of "_PR0 dependent device" we can solve this
by calling the relevant add/remove functions when the PCI device is bind
to its ACPI representation. If it has power resources the PCI device
will be added as dependent device to them and runtime resumed whenever
they are physically turned on. This should make sure PCI core can
reconfigure wakes after the device is transitioned into D0uninitialized.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pci/pci-acpi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index b782acac26c5..2abe0eeafb53 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -902,6 +902,7 @@ static void pci_acpi_setup(struct device *dev)
 		device_wakeup_enable(dev);
 
 	acpi_pci_wakeup(pci_dev, false);
+	acpi_device_power_add_dependent(adev, dev);
 }
 
 static void pci_acpi_cleanup(struct device *dev)
@@ -914,6 +915,7 @@ static void pci_acpi_cleanup(struct device *dev)
 
 	pci_acpi_remove_pm_notifier(adev);
 	if (adev->wakeup.flags.valid) {
+		acpi_device_power_remove_dependent(adev, dev);
 		if (pci_dev->bridge_d3)
 			device_wakeup_disable(dev);
 

From b51033e06c2ebbad322370f4a35c84488e61b342 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 25 Jun 2019 14:09:12 +0200
Subject: [PATCH 8/8] PCI: PM/ACPI: Refresh all stale power state data in
 pci_pm_complete()

In pci_pm_complete() there are checks to decide whether or not to
resume devices that were left in runtime-suspend during the preceding
system-wide transition into a sleep state.  They involve checking the
current power state of the device and comparing it with the power
state of it set before the preceding system-wide transition, but the
platform component of the device's power state is not handled
correctly in there.

Namely, on platforms with ACPI, the device power state information
needs to be updated with care, so that the reference counters of
power resources used by the device (if any) are set to ensure that
the refreshed power state of it will be maintained going forward.

To that end, introduce a new ->refresh_state() platform PM callback
for PCI devices, for asking the platform to refresh the device power
state data and ensure that the corresponding power state will be
maintained going forward, make it invoke acpi_device_update_power()
(for devices with ACPI PM) on platforms with ACPI and make
pci_pm_complete() use it, through a new pci_refresh_power_state()
wrapper function.

Fixes: a0d2a959d3da (PCI: Avoid unnecessary resume after direct-complete)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/pci/pci-acpi.c   |  9 +++++++++
 drivers/pci/pci-driver.c |  9 ++++++++-
 drivers/pci/pci.c        | 21 +++++++++++++++++++++
 drivers/pci/pci.h        |  4 ++++
 4 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 2abe0eeafb53..45049f558860 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -692,6 +692,14 @@ static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
 	return state_conv[state];
 }
 
+static void acpi_pci_refresh_power_state(struct pci_dev *dev)
+{
+	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
+
+	if (adev && acpi_device_power_manageable(adev))
+		acpi_device_update_power(adev, NULL);
+}
+
 static int acpi_pci_propagate_wakeup(struct pci_bus *bus, bool enable)
 {
 	while (bus->parent) {
@@ -749,6 +757,7 @@ static const struct pci_platform_pm_ops acpi_pci_platform_pm = {
 	.is_manageable = acpi_pci_power_manageable,
 	.set_state = acpi_pci_set_power_state,
 	.get_state = acpi_pci_get_power_state,
+	.refresh_state = acpi_pci_refresh_power_state,
 	.choose_state = acpi_pci_choose_state,
 	.set_wakeup = acpi_pci_wakeup,
 	.need_resume = acpi_pci_need_resume,
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index bd097ea5925c..3149f11ca265 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -710,7 +710,14 @@ static void pci_pm_complete(struct device *dev)
 	if (pm_runtime_suspended(dev) && pm_resume_via_firmware()) {
 		pci_power_t pre_sleep_state = pci_dev->current_state;
 
-		pci_update_current_state(pci_dev, pci_dev->current_state);
+		pci_refresh_power_state(pci_dev);
+		/*
+		 * On platforms with ACPI this check may also trigger for
+		 * devices sharing power resources if one of those power
+		 * resources has been activated as a result of a change of the
+		 * power state of another device sharing it.  However, in that
+		 * case it is also better to resume the device, in general.
+		 */
 		if (pci_dev->current_state < pre_sleep_state)
 			pm_request_resume(dev);
 	}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e34fb2b3c466..b1f563916036 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -777,6 +777,12 @@ static inline pci_power_t platform_pci_get_power_state(struct pci_dev *dev)
 	return pci_platform_pm ? pci_platform_pm->get_state(dev) : PCI_UNKNOWN;
 }
 
+static inline void platform_pci_refresh_power_state(struct pci_dev *dev)
+{
+	if (pci_platform_pm && pci_platform_pm->refresh_state)
+		pci_platform_pm->refresh_state(dev);
+}
+
 static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev)
 {
 	return pci_platform_pm ?
@@ -937,6 +943,21 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
 	}
 }
 
+/**
+ * pci_refresh_power_state - Refresh the given device's power state data
+ * @dev: Target PCI device.
+ *
+ * Ask the platform to refresh the devices power state information and invoke
+ * pci_update_current_state() to update its current PCI power state.
+ */
+void pci_refresh_power_state(struct pci_dev *dev)
+{
+	if (platform_pci_power_manageable(dev))
+		platform_pci_refresh_power_state(dev);
+
+	pci_update_current_state(dev, dev->current_state);
+}
+
 /**
  * pci_power_up - Put the given device into D0 forcibly
  * @dev: PCI device to power up
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 7f9fd93270ed..5db6f985f16d 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -51,6 +51,8 @@ int pci_bus_error_reset(struct pci_dev *dev);
  *
  * @get_state: queries the platform firmware for a device's current power state
  *
+ * @refresh_state: asks the platform to refresh the device's power state data
+ *
  * @choose_state: returns PCI power state of given device preferred by the
  *		  platform; to be used during system-wide transitions from a
  *		  sleeping state to the working state and vice versa
@@ -69,6 +71,7 @@ struct pci_platform_pm_ops {
 	bool (*is_manageable)(struct pci_dev *dev);
 	int (*set_state)(struct pci_dev *dev, pci_power_t state);
 	pci_power_t (*get_state)(struct pci_dev *dev);
+	void (*refresh_state)(struct pci_dev *dev);
 	pci_power_t (*choose_state)(struct pci_dev *dev);
 	int (*set_wakeup)(struct pci_dev *dev, bool enable);
 	bool (*need_resume)(struct pci_dev *dev);
@@ -76,6 +79,7 @@ struct pci_platform_pm_ops {
 
 int pci_set_platform_pm(const struct pci_platform_pm_ops *ops);
 void pci_update_current_state(struct pci_dev *dev, pci_power_t state);
+void pci_refresh_power_state(struct pci_dev *dev);
 void pci_power_up(struct pci_dev *dev);
 void pci_disable_enabled_device(struct pci_dev *dev);
 int pci_finish_runtime_suspend(struct pci_dev *dev);