From 234f223d63d8f7db64a682ccf02871d40d38db52 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Jun 2019 00:30:58 +0200 Subject: [PATCH 1/8] PCI: PM: Avoid resuming devices in D3hot during system suspend The current code resumes devices in D3hot during system suspend if the target power state for them is D3cold, but that is not necessary in general. It only is necessary to do that if the platform firmware requires the device to be resumed, but that should be covered by the platform_pci_need_resume() check anyway, so rework pci_dev_keep_suspended() to avoid returning 'false' for devices in D3hot which need not be resumed due to platform firmware requirements. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mika Westerberg --- drivers/pci/pci.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 8abc843b1615..324175509e94 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2474,10 +2474,20 @@ bool pci_dev_keep_suspended(struct pci_dev *pci_dev) { struct device *dev = &pci_dev->dev; bool wakeup = device_may_wakeup(dev); + pci_power_t target_state; - if (!pm_runtime_suspended(dev) - || pci_target_state(pci_dev, wakeup) != pci_dev->current_state - || platform_pci_need_resume(pci_dev)) + if (!pm_runtime_suspended(dev) || platform_pci_need_resume(pci_dev)) + return false; + + target_state = pci_target_state(pci_dev, wakeup); + + /* + * If the earlier platform check has not triggered, D3cold is just power + * removal on top of D3hot, so no need to resume the device in that + * case. + */ + if (target_state != pci_dev->current_state && + target_state != PCI_D3cold && pci_dev->current_state != PCI_D3hot) return false; /* From 0c7376ada9508141becec9b897d73b65ce66a15a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Jun 2019 00:32:31 +0200 Subject: [PATCH 2/8] PCI: PM: Replace pci_dev_keep_suspended() with two functions The code in pci_dev_keep_suspended() is relatively hard to follow due to the negative checks in it and in its callers and the function has a possible side-effect (disabling the PME) which doesn't really match its role. For this reason, move the PME disabling from pci_dev_keep_suspended() to a separate function and change the semantics (and name) of the rest of it, so that 'true' is returned when the device needs to be resumed (and not the other way around). Change the callers of pci_dev_keep_suspended() accordingly. While at it, make the code flow in pci_pm_poweroff() reflect the pci_pm_suspend() more closely to avoid arbitrary differences between them. This is a cosmetic change with no intention to alter behavior. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mika Westerberg --- drivers/pci/pci-driver.c | 22 +++++++++++++--- drivers/pci/pci.c | 55 ++++++++++++++++++++-------------------- drivers/pci/pci.h | 3 ++- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 5eadbc3d0969..dd656cf19a12 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -679,6 +679,7 @@ static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev) static int pci_pm_prepare(struct device *dev) { struct device_driver *drv = dev->driver; + struct pci_dev *pci_dev = to_pci_dev(dev); if (drv && drv->pm && drv->pm->prepare) { int error = drv->pm->prepare(dev); @@ -688,7 +689,15 @@ static int pci_pm_prepare(struct device *dev) if (!error && dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_PREPARE)) return 0; } - return pci_dev_keep_suspended(to_pci_dev(dev)); + if (pci_dev_need_resume(pci_dev)) + return 0; + + /* + * The PME setting needs to be adjusted here in case the direct-complete + * optimization is used with respect to this device. + */ + pci_dev_adjust_pme(pci_dev); + return 1; } static void pci_pm_complete(struct device *dev) @@ -758,9 +767,11 @@ static int pci_pm_suspend(struct device *dev) * better to resume the device from runtime suspend here. */ if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || - !pci_dev_keep_suspended(pci_dev)) { + pci_dev_need_resume(pci_dev)) { pm_runtime_resume(dev); pci_dev->state_saved = false; + } else { + pci_dev_adjust_pme(pci_dev); } if (pm->suspend) { @@ -1108,10 +1119,13 @@ static int pci_pm_poweroff(struct device *dev) /* The reason to do that is the same as in pci_pm_suspend(). */ if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || - !pci_dev_keep_suspended(pci_dev)) + pci_dev_need_resume(pci_dev)) { pm_runtime_resume(dev); + pci_dev->state_saved = false; + } else { + pci_dev_adjust_pme(pci_dev); + } - pci_dev->state_saved = false; if (pm->poweroff) { int error; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 324175509e94..24d86bd1bba1 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2459,55 +2459,56 @@ bool pci_dev_run_wake(struct pci_dev *dev) EXPORT_SYMBOL_GPL(pci_dev_run_wake); /** - * pci_dev_keep_suspended - Check if the device can stay in the suspended state. + * pci_dev_need_resume - Check if it is necessary to resume the device. * @pci_dev: Device to check. * - * Return 'true' if the device is runtime-suspended, it doesn't have to be + * Return 'true' if the device is not runtime-suspended or it has to be * reconfigured due to wakeup settings difference between system and runtime - * suspend and the current power state of it is suitable for the upcoming - * (system) transition. - * - * If the device is not configured for system wakeup, disable PME for it before - * returning 'true' to prevent it from waking up the system unnecessarily. + * suspend, or the current power state of it is not suitable for the upcoming + * (system-wide) transition. */ -bool pci_dev_keep_suspended(struct pci_dev *pci_dev) +bool pci_dev_need_resume(struct pci_dev *pci_dev) { struct device *dev = &pci_dev->dev; - bool wakeup = device_may_wakeup(dev); pci_power_t target_state; if (!pm_runtime_suspended(dev) || platform_pci_need_resume(pci_dev)) - return false; + return true; - target_state = pci_target_state(pci_dev, wakeup); + target_state = pci_target_state(pci_dev, device_may_wakeup(dev)); /* * If the earlier platform check has not triggered, D3cold is just power * removal on top of D3hot, so no need to resume the device in that * case. */ - if (target_state != pci_dev->current_state && - target_state != PCI_D3cold && pci_dev->current_state != PCI_D3hot) - return false; + return target_state != pci_dev->current_state && + target_state != PCI_D3cold && + pci_dev->current_state != PCI_D3hot; +} + +/** + * pci_dev_adjust_pme - Adjust PME setting for a suspended device. + * @pci_dev: Device to check. + * + * If the device is suspended and it is not configured for system wakeup, + * disable PME for it to prevent it from waking up the system unnecessarily. + * + * Note that if the device's power state is D3cold and the platform check in + * pci_dev_need_resume() has not triggered, the device's configuration need not + * be changed. + */ +void pci_dev_adjust_pme(struct pci_dev *pci_dev) +{ + struct device *dev = &pci_dev->dev; - /* - * At this point the device is good to go unless it's been configured - * to generate PME at the runtime suspend time, but it is not supposed - * to wake up the system. In that case, simply disable PME for it - * (it will have to be re-enabled on exit from system resume). - * - * If the device's power state is D3cold and the platform check above - * hasn't triggered, the device's configuration is suitable and we don't - * need to manipulate it at all. - */ spin_lock_irq(&dev->power.lock); - if (pm_runtime_suspended(dev) && pci_dev->current_state < PCI_D3cold && - !wakeup) + if (pm_runtime_suspended(dev) && !device_may_wakeup(dev) && + pci_dev->current_state < PCI_D3cold) __pci_pme_active(pci_dev, false); spin_unlock_irq(&dev->power.lock); - return true; } /** diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9cb99380c61e..e04fa7fd3e2f 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -82,7 +82,8 @@ int pci_finish_runtime_suspend(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); int __pci_pme_wakeup(struct pci_dev *dev, void *ign); void pci_pme_restore(struct pci_dev *dev); -bool pci_dev_keep_suspended(struct pci_dev *dev); +bool pci_dev_need_resume(struct pci_dev *dev); +void pci_dev_adjust_pme(struct pci_dev *dev); void pci_dev_complete_resume(struct pci_dev *pci_dev); void pci_config_pm_runtime_get(struct pci_dev *dev); void pci_config_pm_runtime_put(struct pci_dev *dev); From c2bf1fc212f7e6f25ace1af8f0b3ac061ea48ba5 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 12 Jun 2019 13:57:38 +0300 Subject: [PATCH 3/8] PCI: Add missing link delays required by the PCIe spec Currently Linux does not follow PCIe spec regarding the required delays after reset. A concrete example is a Thunderbolt add-in-card that consists of a PCIe switch and two PCIe endpoints: +-1b.0-[01-6b]----00.0-[02-6b]--+-00.0-[03]----00.0 TBT controller +-01.0-[04-36]-- DS hotplug port +-02.0-[37]----00.0 xHCI controller \-04.0-[38-6b]-- DS hotplug port The root port (1b.0) and the PCIe switch downstream ports are all PCIe gen3 so they support 8GT/s link speeds. We wait for the PCIe hierarchy to enter D3cold (runtime): pcieport 0000:00:1b.0: power state changed by ACPI to D3cold When it wakes up from D3cold, according to the PCIe 4.0 section 5.8 the PCIe switch is put to reset and its power is re-applied. This means that we must follow the rules in PCIe 4.0 section 6.6.1. For the PCIe gen3 ports we are dealing with here, the following applies: With a Downstream Port that supports Link speeds greater than 5.0 GT/s, software must wait a minimum of 100 ms after Link training completes before sending a Configuration Request to the device immediately below that Port. Software can determine when Link training completes by polling the Data Link Layer Link Active bit or by setting up an associated interrupt (see Section 6.7.3.3). Translating this into the above topology we would need to do this (DLLLA stands for Data Link Layer Link Active): pcieport 0000:00:1b.0: wait for 100ms after DLLLA is set before access to 0000:01:00.0 pcieport 0000:02:00.0: wait for 100ms after DLLLA is set before access to 0000:03:00.0 pcieport 0000:02:02.0: wait for 100ms after DLLLA is set before access to 0000:37:00.0 I've instrumented the kernel with additional logging so we can see the actual delays the kernel performs: pcieport 0000:00:1b.0: power state changed by ACPI to D0 pcieport 0000:00:1b.0: waiting for D3cold delay of 100 ms pcieport 0000:00:1b.0: waking up bus pcieport 0000:00:1b.0: waiting for D3hot delay of 10 ms pcieport 0000:00:1b.0: restoring config space at offset 0x2c (was 0x60, writing 0x60) ... pcieport 0000:00:1b.0: PME# disabled pcieport 0000:01:00.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) ... pcieport 0000:01:00.0: PME# disabled pcieport 0000:02:00.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) ... pcieport 0000:02:00.0: PME# disabled pcieport 0000:02:01.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) ... pcieport 0000:02:01.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407) pcieport 0000:02:01.0: PME# disabled pcieport 0000:02:02.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) ... pcieport 0000:02:02.0: PME# disabled pcieport 0000:02:04.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) ... pcieport 0000:02:04.0: PME# disabled pcieport 0000:02:01.0: PME# enabled pcieport 0000:02:01.0: waiting for D3hot delay of 10 ms pcieport 0000:02:04.0: PME# enabled pcieport 0000:02:04.0: waiting for D3hot delay of 10 ms thunderbolt 0000:03:00.0: restoring config space at offset 0x14 (was 0x0, writing 0x8a040000) ... thunderbolt 0000:03:00.0: PME# disabled xhci_hcd 0000:37:00.0: restoring config space at offset 0x10 (was 0x0, writing 0x73f00000) ... xhci_hcd 0000:37:00.0: PME# disabled For the switch upstream port (01:00.0) we wait for 100ms but not taking into account the DLLLA requirement. We then wait 10ms for D3hot -> D0 transition of the root port and the two downstream hotplug ports. This means that we deviate from what the spec requires. Performing the same check for system sleep (s2idle) transitions we can see following when resuming from s2idle: pcieport 0000:00:1b.0: power state changed by ACPI to D0 pcieport 0000:00:1b.0: restoring config space at offset 0x2c (was 0x60, writing 0x60) ... pcieport 0000:01:00.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) ... pcieport 0000:02:02.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) pcieport 0000:02:02.0: restoring config space at offset 0x2c (was 0x0, writing 0x0) pcieport 0000:02:01.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) pcieport 0000:02:04.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) pcieport 0000:02:02.0: restoring config space at offset 0x28 (was 0x0, writing 0x0) pcieport 0000:02:00.0: restoring config space at offset 0x3c (was 0x1ff, writing 0x201ff) pcieport 0000:02:02.0: restoring config space at offset 0x24 (was 0x10001, writing 0x1fff1) pcieport 0000:02:01.0: restoring config space at offset 0x2c (was 0x0, writing 0x60) pcieport 0000:02:02.0: restoring config space at offset 0x20 (was 0x0, writing 0x73f073f0) pcieport 0000:02:04.0: restoring config space at offset 0x2c (was 0x0, writing 0x60) pcieport 0000:02:01.0: restoring config space at offset 0x28 (was 0x0, writing 0x60) pcieport 0000:02:00.0: restoring config space at offset 0x2c (was 0x0, writing 0x0) pcieport 0000:02:02.0: restoring config space at offset 0x1c (was 0x101, writing 0x1f1) pcieport 0000:02:04.0: restoring config space at offset 0x28 (was 0x0, writing 0x60) pcieport 0000:02:01.0: restoring config space at offset 0x24 (was 0x10001, writing 0x1ff10001) pcieport 0000:02:00.0: restoring config space at offset 0x28 (was 0x0, writing 0x0) pcieport 0000:02:02.0: restoring config space at offset 0x18 (was 0x0, writing 0x373702) pcieport 0000:02:04.0: restoring config space at offset 0x24 (was 0x10001, writing 0x49f12001) pcieport 0000:02:01.0: restoring config space at offset 0x20 (was 0x0, writing 0x73e05c00) pcieport 0000:02:00.0: restoring config space at offset 0x24 (was 0x10001, writing 0x1fff1) pcieport 0000:02:04.0: restoring config space at offset 0x20 (was 0x0, writing 0x89f07400) pcieport 0000:02:01.0: restoring config space at offset 0x1c (was 0x101, writing 0x5151) pcieport 0000:02:00.0: restoring config space at offset 0x20 (was 0x0, writing 0x8a008a00) pcieport 0000:02:02.0: restoring config space at offset 0xc (was 0x10000, writing 0x10020) pcieport 0000:02:04.0: restoring config space at offset 0x1c (was 0x101, writing 0x6161) pcieport 0000:02:01.0: restoring config space at offset 0x18 (was 0x0, writing 0x360402) pcieport 0000:02:00.0: restoring config space at offset 0x1c (was 0x101, writing 0x1f1) pcieport 0000:02:04.0: restoring config space at offset 0x18 (was 0x0, writing 0x6b3802) pcieport 0000:02:02.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407) pcieport 0000:02:00.0: restoring config space at offset 0x18 (was 0x0, writing 0x30302) pcieport 0000:02:01.0: restoring config space at offset 0xc (was 0x10000, writing 0x10020) pcieport 0000:02:04.0: restoring config space at offset 0xc (was 0x10000, writing 0x10020) pcieport 0000:02:00.0: restoring config space at offset 0xc (was 0x10000, writing 0x10020) pcieport 0000:02:01.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407) pcieport 0000:02:04.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407) pcieport 0000:02:00.0: restoring config space at offset 0x4 (was 0x100000, writing 0x100407) xhci_hcd 0000:37:00.0: restoring config space at offset 0x10 (was 0x0, writing 0x73f00000) ... thunderbolt 0000:03:00.0: restoring config space at offset 0x14 (was 0x0, writing 0x8a040000) This is even worse. None of the mandatory delays are performed. If this would be S3 instead of s2idle then according to PCI FW spec 3.2 section 4.6.8. there is a specific _DSM that allows the OS to skip the delays but this platform does not provide the _DSM and does not go to S3 anyway so no firmware is involved that could already handle these delays. In this particular Intel Coffee Lake platform these delays are not actually needed because there is an additional delay as part of the ACPI power resource that is used to turn on power to the hierarchy but since that additional delay is not required by any of standards (PCIe, ACPI) it is not present in the Intel Ice Lake, for example where missing the mandatory delays causes pciehp to start tearing down the stack too early (links are not yet trained). For this reason, change the PCIe portdrv PM resume hooks so that they perform the mandatory delays before the downstream component gets resumed. We perform the delays before port services are resumed because otherwise pciehp might find that the link is not up (even if it is just training) and tears-down the hierarchy. Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/pci/pci.c | 29 ++++++++++----- drivers/pci/pci.h | 1 + drivers/pci/pcie/portdrv_core.c | 66 +++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 10 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 24d86bd1bba1..9839d6f9bcb5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1004,15 +1004,10 @@ static void __pci_start_power_transition(struct pci_dev *dev, pci_power_t state) if (state == PCI_D0) { pci_platform_power_transition(dev, PCI_D0); /* - * Mandatory power management transition delays, see - * PCI Express Base Specification Revision 2.0 Section - * 6.6.1: Conventional Reset. Do not delay for - * devices powered on/off by corresponding bridge, - * because have already delayed for the bridge. + * Mandatory power management transition delays are + * handled in the PCIe portdrv resume hooks. */ if (dev->runtime_d3cold) { - if (dev->d3cold_delay && !dev->imm_ready) - msleep(dev->d3cold_delay); /* * When powering on a bridge from D3cold, the * whole hierarchy may be powered on into @@ -4579,14 +4574,16 @@ static int pci_pm_reset(struct pci_dev *dev, int probe) return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS); } + /** - * pcie_wait_for_link - Wait until link is active or inactive + * pcie_wait_for_link_delay - Wait until link is active or inactive * @pdev: Bridge device * @active: waiting for active or inactive? + * @delay: Delay to wait after link has become active (in ms) * * Use this to wait till link becomes active or inactive. */ -bool pcie_wait_for_link(struct pci_dev *pdev, bool active) +bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active, int delay) { int timeout = 1000; bool ret; @@ -4623,13 +4620,25 @@ bool pcie_wait_for_link(struct pci_dev *pdev, bool active) timeout -= 10; } if (active && ret) - msleep(100); + msleep(delay); else if (ret != active) pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n", active ? "set" : "cleared"); return ret == active; } +/** + * pcie_wait_for_link - Wait until link is active or inactive + * @pdev: Bridge device + * @active: waiting for active or inactive? + * + * Use this to wait till link becomes active or inactive. + */ +bool pcie_wait_for_link(struct pci_dev *pdev, bool active) +{ + return pcie_wait_for_link_delay(pdev, active, 100); +} + void pci_reset_secondary_bus(struct pci_dev *dev) { u16 ctrl; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index e04fa7fd3e2f..7f9fd93270ed 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -494,6 +494,7 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, u32 service); +bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active, int delay); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); #ifdef CONFIG_PCIEASPM void pcie_aspm_init_link_state(struct pci_dev *pdev); diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 1b330129089f..308c3e0c4a34 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -378,6 +379,67 @@ static int pm_iter(struct device *dev, void *data) return 0; } +static int get_downstream_delay(struct pci_bus *bus) +{ + struct pci_dev *pdev; + int min_delay = 100; + int max_delay = 0; + + list_for_each_entry(pdev, &bus->devices, bus_list) { + if (!pdev->imm_ready) + min_delay = 0; + else if (pdev->d3cold_delay < min_delay) + min_delay = pdev->d3cold_delay; + if (pdev->d3cold_delay > max_delay) + max_delay = pdev->d3cold_delay; + } + + return max(min_delay, max_delay); +} + +/* + * wait_for_downstream_link - Wait for downstream link to establish + * @pdev: PCIe port whose downstream link is waited + * + * Handle delays according to PCIe 4.0 section 6.6.1 before configuration + * access to the downstream component is permitted. + * + * This blocks PCI core resume of the hierarchy below this port until the + * link is trained. Should be called before resuming port services to + * prevent pciehp from starting to tear-down the hierarchy too soon. + */ +static void wait_for_downstream_link(struct pci_dev *pdev) +{ + int delay; + + if (pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT && + pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM) + return; + + if (pci_dev_is_disconnected(pdev)) + return; + + if (!pdev->subordinate || list_empty(&pdev->subordinate->devices) || + !pdev->bridge_d3) + return; + + delay = get_downstream_delay(pdev->subordinate); + if (!delay) + return; + + dev_dbg(&pdev->dev, "waiting downstream link for %d ms\n", delay); + + /* + * If downstream port does not support speeds greater than 5 GT/s + * need to wait 100ms. For higher speeds (gen3) we need to wait + * first for the data link layer to become active. + */ + if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) + msleep(delay); + else + pcie_wait_for_link_delay(pdev, true, delay); +} + /** * pcie_port_device_suspend - suspend port services associated with a PCIe port * @dev: PCI Express port to handle @@ -391,6 +453,8 @@ int pcie_port_device_suspend(struct device *dev) int pcie_port_device_resume_noirq(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, resume_noirq); + + wait_for_downstream_link(to_pci_dev(dev)); return device_for_each_child(dev, &off, pm_iter); } @@ -421,6 +485,8 @@ int pcie_port_device_runtime_suspend(struct device *dev) int pcie_port_device_runtime_resume(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, runtime_resume); + + wait_for_downstream_link(to_pci_dev(dev)); return device_for_each_child(dev, &off, pm_iter); } #endif /* PM */ From 000dd5316e1c756a1c028f22e01d06a38249dd4d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 12 Jun 2019 13:57:39 +0300 Subject: [PATCH 4/8] PCI: Do not poll for PME if the device is in D3cold PME polling does not take into account that a device that is directly connected to the host bridge may go into D3cold as well. This leads to a situation where the PME poll thread reads from a config space of a device that is in D3cold and gets incorrect information because the config space is not accessible. Here is an example from Intel Ice Lake system where two PCIe root ports are in D3cold (I've instrumented the kernel to log the PMCSR register contents): [ 62.971442] pcieport 0000:00:07.1: Check PME status, PMCSR=0xffff [ 62.971504] pcieport 0000:00:07.0: Check PME status, PMCSR=0xffff Since 0xffff is interpreted so that PME is pending, the root ports will be runtime resumed. This repeats over and over again essentially blocking all runtime power management. Prevent this from happening by checking whether the device is in D3cold before its PME status is read. Fixes: 71a83bd727cc ("PCI/PM: add runtime PM support to PCIe port") Signed-off-by: Mika Westerberg Reviewed-by: Lukas Wunner Cc: 3.6+ # v3.6+ Signed-off-by: Rafael J. Wysocki --- drivers/pci/pci.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 9839d6f9bcb5..e34fb2b3c466 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2060,6 +2060,13 @@ static void pci_pme_list_scan(struct work_struct *work) */ if (bridge && bridge->current_state != PCI_D0) continue; + /* + * If the device is in D3cold it should not be + * polled either. + */ + if (pme_dev->dev->current_state == PCI_D3cold) + continue; + pci_pme_wakeup(pme_dev->dev, NULL); } else { list_del(&pme_dev->list); From 83a16e3f6d70da99896c7a2639c0b60fff13afb8 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 25 Jun 2019 13:29:40 +0300 Subject: [PATCH 5/8] PCI / ACPI: Use cached ACPI device state to get PCI device power state The ACPI power state returned by acpi_device_get_power() may depend on the configuration of ACPI power resources in the system which may change any time after acpi_device_get_power() has returned, unless the reference counters of the ACPI power resources in question are set to prevent that from happening. Thus it is invalid to use acpi_device_get_power() in acpi_pci_get_power_state() the way it is done now and the value of the ->power.state field in the corresponding struct acpi_device objects (which reflects the ACPI power resources reference counting, among other things) should be used instead. As an example where this becomes an issue is Intel Ice Lake where the Thunderbolt controller (NHI), two PCIe root ports (RP0 and RP1) and xHCI all share the same power resources. The following picture with power resources marked with [] shows the topology: Host bridge | +- RP0 ---\ +- RP1 ---|--+--> [TBT] +- NHI --/ | | | | v +- xHCI --> [D3C] Here TBT and D3C are the shared ACPI power resources. ACPI _PR3() method of the devices in question returns either TBT or D3C or both. Say we runtime suspend first the root ports RP0 and RP1, then NHI. Now since the TBT power resource is still on when the root ports are runtime suspended their dev->current_state is set to D3hot. When NHI is runtime suspended TBT is finally turned off but state of the root ports remain to be D3hot. Now when the xHCI is runtime suspended D3C gets also turned off. PCI core thus has power states of these devices cached in their dev->current_state as follows: RP0 -> D3hot RP1 -> D3hot NHI -> D3cold xHCI -> D3cold If the user now runs lspci for instance, the result is all 1's like in the below output (00:07.0 is the first root port, RP0): 00:07.0 PCI bridge: Intel Corporation Device 8a1d (rev ff) (prog-if ff) !!! Unknown header type 7f Kernel driver in use: pcieport In short the hardware state is not in sync with the software state anymore. The exact same thing happens with the PME polling thread which ends up bringing the root ports back into D0 after they are runtime suspended. For this reason, modify acpi_pci_get_power_state() so that it uses the ACPI device power state that was cached by the ACPI core. This makes the PCI device power state match the ACPI device power state regardless of state of the shared power resources which may still be on at this point. Link: https://lore.kernel.org/r/20190618161858.77834-2-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/pci/pci-acpi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 1897847ceb0c..b782acac26c5 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -685,7 +685,8 @@ static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) if (!adev || !acpi_device_power_manageable(adev)) return PCI_UNKNOWN; - if (acpi_device_get_power(adev, &state) || state == ACPI_STATE_UNKNOWN) + state = adev->power.state; + if (state == ACPI_STATE_UNKNOWN) return PCI_UNKNOWN; return state_conv[state]; From 4533771c1e53b921f66e580135ee64a76986a491 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 25 Jun 2019 13:29:41 +0300 Subject: [PATCH 6/8] ACPI / PM: Introduce concept of a _PR0 dependent device If there are shared power resources between otherwise unrelated devices turning them on causes the other devices sharing them to be powered up as well. In case of PCI devices go into D0uninitialized state meaning that if they were configured to trigger wake that configuration is lost at this point. For this reason introduce a concept of "_PR0 dependent device" that can be added to any ACPI device that has power resources. The dependent device will be included in a list of dependent devices for all power resources returned by the ACPI device's _PR0 (assuming it has one). Whenever a power resource having dependent devices is turned physically on (its _ON method is called) we runtime resume all of them to allow their driver or in case of PCI the PCI core to re-initialize the device and its wake configuration. This adds two functions that can be used to add and remove these dependent devices. Note the dependent device does not necessary need share power resources so this functionality can be used to add "software dependencies" as well if needed. Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/power.c | 135 ++++++++++++++++++++++++++++++++++++++++ include/acpi/acpi_bus.h | 4 ++ 2 files changed, 139 insertions(+) diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index a916417b9e70..fe1e7bc91a5e 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -42,6 +42,11 @@ ACPI_MODULE_NAME("power"); #define ACPI_POWER_RESOURCE_STATE_ON 0x01 #define ACPI_POWER_RESOURCE_STATE_UNKNOWN 0xFF +struct acpi_power_dependent_device { + struct device *dev; + struct list_head node; +}; + struct acpi_power_resource { struct acpi_device device; struct list_head list_node; @@ -51,6 +56,7 @@ struct acpi_power_resource { unsigned int ref_count; bool wakeup_enabled; struct mutex resource_lock; + struct list_head dependents; }; struct acpi_power_resource_entry { @@ -232,8 +238,121 @@ static int acpi_power_get_list_state(struct list_head *list, int *state) return 0; } +static int +acpi_power_resource_add_dependent(struct acpi_power_resource *resource, + struct device *dev) +{ + struct acpi_power_dependent_device *dep; + int ret = 0; + + mutex_lock(&resource->resource_lock); + list_for_each_entry(dep, &resource->dependents, node) { + /* Only add it once */ + if (dep->dev == dev) + goto unlock; + } + + dep = kzalloc(sizeof(*dep), GFP_KERNEL); + if (!dep) { + ret = -ENOMEM; + goto unlock; + } + + dep->dev = dev; + list_add_tail(&dep->node, &resource->dependents); + dev_dbg(dev, "added power dependency to [%s]\n", resource->name); + +unlock: + mutex_unlock(&resource->resource_lock); + return ret; +} + +static void +acpi_power_resource_remove_dependent(struct acpi_power_resource *resource, + struct device *dev) +{ + struct acpi_power_dependent_device *dep; + + mutex_lock(&resource->resource_lock); + list_for_each_entry(dep, &resource->dependents, node) { + if (dep->dev == dev) { + list_del(&dep->node); + kfree(dep); + dev_dbg(dev, "removed power dependency to [%s]\n", + resource->name); + break; + } + } + mutex_unlock(&resource->resource_lock); +} + +/** + * acpi_device_power_add_dependent - Add dependent device of this ACPI device + * @adev: ACPI device pointer + * @dev: Dependent device + * + * If @adev has non-empty _PR0 the @dev is added as dependent device to all + * power resources returned by it. This means that whenever these power + * resources are turned _ON the dependent devices get runtime resumed. This + * is needed for devices such as PCI to allow its driver to re-initialize + * it after it went to D0uninitialized. + * + * If @adev does not have _PR0 this does nothing. + * + * Returns %0 in case of success and negative errno otherwise. + */ +int acpi_device_power_add_dependent(struct acpi_device *adev, + struct device *dev) +{ + struct acpi_power_resource_entry *entry; + struct list_head *resources; + int ret; + + if (!adev->flags.power_manageable) + return 0; + + resources = &adev->power.states[ACPI_STATE_D0].resources; + list_for_each_entry(entry, resources, node) { + ret = acpi_power_resource_add_dependent(entry->resource, dev); + if (ret) + goto err; + } + + return 0; + +err: + list_for_each_entry(entry, resources, node) + acpi_power_resource_remove_dependent(entry->resource, dev); + + return ret; +} + +/** + * acpi_device_power_remove_dependent - Remove dependent device + * @adev: ACPI device pointer + * @dev: Dependent device + * + * Does the opposite of acpi_device_power_add_dependent() and removes the + * dependent device if it is found. Can be called to @adev that does not + * have _PR0 as well. + */ +void acpi_device_power_remove_dependent(struct acpi_device *adev, + struct device *dev) +{ + struct acpi_power_resource_entry *entry; + struct list_head *resources; + + if (!adev->flags.power_manageable) + return; + + resources = &adev->power.states[ACPI_STATE_D0].resources; + list_for_each_entry_reverse(entry, resources, node) + acpi_power_resource_remove_dependent(entry->resource, dev); +} + static int __acpi_power_on(struct acpi_power_resource *resource) { + struct acpi_power_dependent_device *dep; acpi_status status = AE_OK; status = acpi_evaluate_object(resource->device.handle, "_ON", NULL, NULL); @@ -243,6 +362,21 @@ static int __acpi_power_on(struct acpi_power_resource *resource) ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Power resource [%s] turned on\n", resource->name)); + /* + * If there are other dependents on this power resource we need to + * resume them now so that their drivers can re-initialize the + * hardware properly after it went back to D0. + */ + if (list_empty(&resource->dependents) || + list_is_singular(&resource->dependents)) + return 0; + + list_for_each_entry(dep, &resource->dependents, node) { + dev_dbg(dep->dev, "runtime resuming because [%s] turned on\n", + resource->name); + pm_request_resume(dep->dev); + } + return 0; } @@ -810,6 +944,7 @@ int acpi_add_power_resource(acpi_handle handle) ACPI_STA_DEFAULT); mutex_init(&resource->resource_lock); INIT_LIST_HEAD(&resource->list_node); + INIT_LIST_HEAD(&resource->dependents); resource->name = device->pnp.bus_id; strcpy(acpi_device_name(device), ACPI_POWER_DEVICE_NAME); strcpy(acpi_device_class(device), ACPI_POWER_CLASS); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 31b6c87d6240..4752ff0a9d9b 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -513,6 +513,10 @@ int acpi_device_fix_up_power(struct acpi_device *device); int acpi_bus_update_power(acpi_handle handle, int *state_p); int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); +int acpi_device_power_add_dependent(struct acpi_device *adev, + struct device *dev); +void acpi_device_power_remove_dependent(struct acpi_device *adev, + struct device *dev); #ifdef CONFIG_PM bool acpi_bus_can_wakeup(acpi_handle handle); From 53b22f900c2d282bf6499712930188cc02306e4e Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 25 Jun 2019 13:29:42 +0300 Subject: [PATCH 7/8] PCI / ACPI: Add _PR0 dependent devices If otherwise unrelated PCI devices share ACPI power resources turning them on causes the devices to enter D0uninitialized power state which may cause problems. For example in Intel Ice Lake two root ports (RP0 and RP1), Thunderbolt controller (NHI) and xHCI controller all share power resources as can be ween in the topology below where power resources are marked with []: Host bridge | +- RP0 ---\ +- RP1 ---|--+--> [TBT] +- NHI --/ | | | | v +- xHCI --> [D3C] In a situation where all devices sharing the power resources are in D3cold (the power resources are turned off) and for example the Thunderbolt controller is runtime resumed resulting that the power resources are turned on. This means that the other devices sharing them (RP0, RP1 and xHCI) are transitioned into D0uninitialized state. If they were configured to trigger wake (PME) on a certain event that configuration gets lost after reset so we would need to re-initialize them to get the wakeup working as expected again. To do so we would need to runtime resume all of them to make sure their registers get restored properly before we can runtime suspend them again. Since we just added concept of "_PR0 dependent device" we can solve this by calling the relevant add/remove functions when the PCI device is bind to its ACPI representation. If it has power resources the PCI device will be added as dependent device to them and runtime resumed whenever they are physically turned on. This should make sure PCI core can reconfigure wakes after the device is transitioned into D0uninitialized. Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/pci/pci-acpi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index b782acac26c5..2abe0eeafb53 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -902,6 +902,7 @@ static void pci_acpi_setup(struct device *dev) device_wakeup_enable(dev); acpi_pci_wakeup(pci_dev, false); + acpi_device_power_add_dependent(adev, dev); } static void pci_acpi_cleanup(struct device *dev) @@ -914,6 +915,7 @@ static void pci_acpi_cleanup(struct device *dev) pci_acpi_remove_pm_notifier(adev); if (adev->wakeup.flags.valid) { + acpi_device_power_remove_dependent(adev, dev); if (pci_dev->bridge_d3) device_wakeup_disable(dev); From b51033e06c2ebbad322370f4a35c84488e61b342 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 25 Jun 2019 14:09:12 +0200 Subject: [PATCH 8/8] PCI: PM/ACPI: Refresh all stale power state data in pci_pm_complete() In pci_pm_complete() there are checks to decide whether or not to resume devices that were left in runtime-suspend during the preceding system-wide transition into a sleep state. They involve checking the current power state of the device and comparing it with the power state of it set before the preceding system-wide transition, but the platform component of the device's power state is not handled correctly in there. Namely, on platforms with ACPI, the device power state information needs to be updated with care, so that the reference counters of power resources used by the device (if any) are set to ensure that the refreshed power state of it will be maintained going forward. To that end, introduce a new ->refresh_state() platform PM callback for PCI devices, for asking the platform to refresh the device power state data and ensure that the corresponding power state will be maintained going forward, make it invoke acpi_device_update_power() (for devices with ACPI PM) on platforms with ACPI and make pci_pm_complete() use it, through a new pci_refresh_power_state() wrapper function. Fixes: a0d2a959d3da (PCI: Avoid unnecessary resume after direct-complete) Signed-off-by: Rafael J. Wysocki Reviewed-by: Mika Westerberg --- drivers/pci/pci-acpi.c | 9 +++++++++ drivers/pci/pci-driver.c | 9 ++++++++- drivers/pci/pci.c | 21 +++++++++++++++++++++ drivers/pci/pci.h | 4 ++++ 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 2abe0eeafb53..45049f558860 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -692,6 +692,14 @@ static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) return state_conv[state]; } +static void acpi_pci_refresh_power_state(struct pci_dev *dev) +{ + struct acpi_device *adev = ACPI_COMPANION(&dev->dev); + + if (adev && acpi_device_power_manageable(adev)) + acpi_device_update_power(adev, NULL); +} + static int acpi_pci_propagate_wakeup(struct pci_bus *bus, bool enable) { while (bus->parent) { @@ -749,6 +757,7 @@ static const struct pci_platform_pm_ops acpi_pci_platform_pm = { .is_manageable = acpi_pci_power_manageable, .set_state = acpi_pci_set_power_state, .get_state = acpi_pci_get_power_state, + .refresh_state = acpi_pci_refresh_power_state, .choose_state = acpi_pci_choose_state, .set_wakeup = acpi_pci_wakeup, .need_resume = acpi_pci_need_resume, diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index bd097ea5925c..3149f11ca265 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -710,7 +710,14 @@ static void pci_pm_complete(struct device *dev) if (pm_runtime_suspended(dev) && pm_resume_via_firmware()) { pci_power_t pre_sleep_state = pci_dev->current_state; - pci_update_current_state(pci_dev, pci_dev->current_state); + pci_refresh_power_state(pci_dev); + /* + * On platforms with ACPI this check may also trigger for + * devices sharing power resources if one of those power + * resources has been activated as a result of a change of the + * power state of another device sharing it. However, in that + * case it is also better to resume the device, in general. + */ if (pci_dev->current_state < pre_sleep_state) pm_request_resume(dev); } diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e34fb2b3c466..b1f563916036 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -777,6 +777,12 @@ static inline pci_power_t platform_pci_get_power_state(struct pci_dev *dev) return pci_platform_pm ? pci_platform_pm->get_state(dev) : PCI_UNKNOWN; } +static inline void platform_pci_refresh_power_state(struct pci_dev *dev) +{ + if (pci_platform_pm && pci_platform_pm->refresh_state) + pci_platform_pm->refresh_state(dev); +} + static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev) { return pci_platform_pm ? @@ -937,6 +943,21 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state) } } +/** + * pci_refresh_power_state - Refresh the given device's power state data + * @dev: Target PCI device. + * + * Ask the platform to refresh the devices power state information and invoke + * pci_update_current_state() to update its current PCI power state. + */ +void pci_refresh_power_state(struct pci_dev *dev) +{ + if (platform_pci_power_manageable(dev)) + platform_pci_refresh_power_state(dev); + + pci_update_current_state(dev, dev->current_state); +} + /** * pci_power_up - Put the given device into D0 forcibly * @dev: PCI device to power up diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 7f9fd93270ed..5db6f985f16d 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -51,6 +51,8 @@ int pci_bus_error_reset(struct pci_dev *dev); * * @get_state: queries the platform firmware for a device's current power state * + * @refresh_state: asks the platform to refresh the device's power state data + * * @choose_state: returns PCI power state of given device preferred by the * platform; to be used during system-wide transitions from a * sleeping state to the working state and vice versa @@ -69,6 +71,7 @@ struct pci_platform_pm_ops { bool (*is_manageable)(struct pci_dev *dev); int (*set_state)(struct pci_dev *dev, pci_power_t state); pci_power_t (*get_state)(struct pci_dev *dev); + void (*refresh_state)(struct pci_dev *dev); pci_power_t (*choose_state)(struct pci_dev *dev); int (*set_wakeup)(struct pci_dev *dev, bool enable); bool (*need_resume)(struct pci_dev *dev); @@ -76,6 +79,7 @@ struct pci_platform_pm_ops { int pci_set_platform_pm(const struct pci_platform_pm_ops *ops); void pci_update_current_state(struct pci_dev *dev, pci_power_t state); +void pci_refresh_power_state(struct pci_dev *dev); void pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev);