From 4881f603d7b82df2bc15efd2a272f973a3bf8df1 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 25 Apr 2014 08:44:59 +0800 Subject: [PATCH 01/11] PM / hibernate: use unsigned local variables in swsusp_show_speed() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit do_div() needs 'u64' type, or it reports warning. And negative number is meaningless for "speed", so change all signed to unsigned within swsusp_show_speed(). The related warning (with allmodconfig for unicore32): CC kernel/power/hibernate.o kernel/power/hibernate.c: In function ‘swsusp_show_speed’: kernel/power/hibernate.c:237: warning: comparison of distinct pointer types lacks a cast Signed-off-by: Chen Gang [rjw: Subject] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f4f2073711d3..de4b989cc8fd 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -228,19 +228,23 @@ static void platform_recover(int platform_mode) void swsusp_show_speed(struct timeval *start, struct timeval *stop, unsigned nr_pages, char *msg) { - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; + u64 elapsed_centisecs64; + unsigned int centisecs; + unsigned int k; + unsigned int kps; elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + /* + * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, + * it is obvious enough for what went wrong. 
+ */ do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ k = nr_pages * (PAGE_SIZE / 1024); kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", + printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", msg, k, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10); From 52c324f8a87b336496d0f5e9d8dff1aa32bb08cd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 1 May 2014 00:13:47 +0200 Subject: [PATCH 02/11] cpuidle: Combine cpuidle_enabled() with cpuidle_select() Since both cpuidle_enabled() and cpuidle_select() are only called by cpuidle_idle_call(), it is not really useful to keep them separate and combining them will help to avoid complicating cpuidle_idle_call() even further if governors are changed to return error codes sometimes. This code modification shouldn't lead to any functional changes. Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/cpuidle.c | 26 ++++++-------------------- include/linux/cpuidle.h | 5 ----- kernel/sched/idle.c | 20 +++++++------------- 3 files changed, 13 insertions(+), 38 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 8236746e46bb..f38359f64cc6 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -64,26 +64,6 @@ int cpuidle_play_dead(void) return -ENODEV; } -/** - * cpuidle_enabled - check if the cpuidle framework is ready - * @dev: cpuidle device for this cpu - * @drv: cpuidle driver for this cpu - * - * Return 0 on success, otherwise: - * -NODEV : the cpuidle framework is not available - * -EBUSY : the cpuidle framework is not initialized - */ -int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev) -{ - if (off || !initialized) - return -ENODEV; - - if (!drv || !dev || !dev->enabled) - return -EBUSY; - - return 0; -} - /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -138,6 +118,12 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, */ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { + if (off || !initialized) + return -ENODEV; + + if (!drv || !dev || !dev->enabled) + return -EBUSY; + return cpuidle_curr_governor->select(drv, dev); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index b0238cba440b..a8d5bd391a26 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -120,8 +120,6 @@ struct cpuidle_driver { #ifdef CONFIG_CPU_IDLE extern void disable_cpuidle(void); -extern int cpuidle_enabled(struct cpuidle_driver *drv, - struct cpuidle_device *dev); extern int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter(struct cpuidle_driver *drv, @@ -149,9 +147,6 @@ extern int cpuidle_play_dead(void); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else 
static inline void disable_cpuidle(void) { } -static inline int cpuidle_enabled(struct cpuidle_driver *drv, - struct cpuidle_device *dev) -{return -ENODEV; } static inline int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a079c7..a8f12247ce7c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -101,19 +101,13 @@ static int cpuidle_idle_call(void) rcu_idle_enter(); /* - * Check if the cpuidle framework is ready, otherwise fallback - * to the default arch specific idle method + * Ask the cpuidle framework to choose a convenient idle state. + * Fall back to the default arch specific idle method on errors. */ - ret = cpuidle_enabled(drv, dev); - - if (!ret) { - /* - * Ask the governor to choose an idle state it thinks - * it is convenient to go to. There is *always* a - * convenient idle state - */ - next_state = cpuidle_select(drv, dev); + next_state = cpuidle_select(drv, dev); + ret = next_state; + if (ret >= 0) { /* * The idle task must be scheduled, it is pointless to * go to idle, just update no idle residency and get @@ -140,7 +134,7 @@ static int cpuidle_idle_call(void) CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu); - if (!ret) { + if (ret >= 0) { trace_cpu_idle_rcuidle(next_state, dev->cpu); /* @@ -175,7 +169,7 @@ static int cpuidle_idle_call(void) * We can't use the cpuidle framework, let's use the default * idle routine */ - if (ret) + if (ret < 0) arch_cpu_idle(); __current_set_polling(); From 3836785a1bdcd6706c68ad46bf53adc0b057b310 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 1 May 2014 00:14:04 +0200 Subject: [PATCH 03/11] cpuidle / menu: Return (-1) if there are no suitable states If there is a PM QoS latency limit and all of the sufficiently shallow C-states are disabled, the cpuidle menu governor returns 0 which on some systems is CPUIDLE_DRIVER_STATE_START and shouldn't be returned if that C-state has been disabled. 
Fix the issue by modifying the menu governor to return (-1) in such situations. Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/menu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 71b523293354..3ca15a8cbaa8 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -296,7 +296,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->needs_update = 0; } - data->last_state_idx = 0; + data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; /* Special case when user has set very strict latency requirement */ if (unlikely(latency_req == 0)) From 2c730785d9532d2a9c46e059bd6a6c9a764c539f Mon Sep 17 00:00:00 2001 From: Sebastian Capella Date: Mon, 21 Apr 2014 17:30:46 -0700 Subject: [PATCH 04/11] PM / hibernate: no kernel_power_off when pm_power_off NULL Reboot logic in kernel/reboot will avoid calling kernel_power_off when pm_power_off is null, and instead uses kernel_halt. Change hibernate's power_down to follow the behavior in the reboot call. Calling the notifier twice (once for SYS_POWER_OFF and again for SYS_HALT) causes a panic during hibernation on Kirkwood Openblocks A6 board. Signed-off-by: Sebastian Capella Reported-by: Ezequiel Garcia Reviewed-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index de4b989cc8fd..1f08ac7f55d8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -599,7 +599,8 @@ static void power_down(void) case HIBERNATION_PLATFORM: hibernation_platform_enter(); case HIBERNATION_SHUTDOWN: - kernel_power_off(); + if (pm_power_off) + kernel_power_off(); break; #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: @@ -627,7 +628,8 @@ static void power_down(void) * corruption after resume. 
*/ printk(KERN_CRIT "PM: Please power down manually\n"); - while(1); + while (1) + cpu_relax(); } /** From bed4d597a0f99b380d24ab3a9da47b62cbf1ad0e Mon Sep 17 00:00:00 2001 From: Chander Kashyap Date: Tue, 22 Apr 2014 18:08:04 +0530 Subject: [PATCH 05/11] cpuidle / menu: move repeated correction factor check to init In menu_select function we check for correction factor every time. If it is zero we are initializing to unity. Hence move it to init function and initialise by unity, hence avoid repeated comparisons. Signed-off-by: Chander Kashyap Reviewed-by: Tuukka Tikkanen Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/menu.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 3ca15a8cbaa8..c4f80c15a48d 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -310,13 +310,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->bucket = which_bucket(data->next_timer_us); - /* - * if the correction factor is 0 (eg first time init or cpu hotplug - * etc), we actually want to start out with a unity factor. - */ - if (data->correction_factor[data->bucket] == 0) - data->correction_factor[data->bucket] = RESOLUTION * DECAY; - /* * Force the result of multiplication to be 64 bits even if both * operands are 32 bits. @@ -466,9 +459,17 @@ static int menu_enable_device(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct menu_device *data = &per_cpu(menu_devices, dev->cpu); + int i; memset(data, 0, sizeof(struct menu_device)); + /* + * if the correction factor is 0 (eg first time init or cpu hotplug + * etc), we actually want to start out with a unity factor. + */ + for(i = 0; i < BUCKETS; i++) + data->correction_factor[i] = RESOLUTION * DECAY; + return 0; } From a6220fc19afc07fe77cfd16f5b8e568615517091 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 5 May 2014 00:51:54 +0200 Subject: [PATCH 06/11] PM / suspend: Always use deepest C-state in the "freeze" sleep state If freeze_enter() is called, we want to bypass the current cpuidle governor and always use the deepest available (that is, not disabled) C-state, because we want to save as much energy as reasonably possible then and runtime latency constraints don't matter at that point, since the system is in a sleep state anyway. Signed-off-by: Rafael J. Wysocki Tested-by: Aubrey Li --- drivers/cpuidle/cpuidle.c | 45 ++++++++++++++++++++++++++++++++++++++- include/linux/cpuidle.h | 2 ++ kernel/power/suspend.c | 2 ++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index f38359f64cc6..cb7019977c50 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -32,6 +32,7 @@ LIST_HEAD(cpuidle_detected_devices); static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; +static bool use_deepest_state __read_mostly; int cpuidle_disabled(void) { @@ -64,6 +65,45 @@ int cpuidle_play_dead(void) return -ENODEV; } +/** + * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode. + * @enable: Whether enable or disable the feature. + * + * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and + * always use the state with the greatest exit latency (out of the states that + * are not disabled). + * + * This function can only be called after cpuidle_pause() to avoid races. + */ +void cpuidle_use_deepest_state(bool enable) +{ + use_deepest_state = enable; +} + +/** + * cpuidle_find_deepest_state - Find the state of the greatest exit latency. + * @drv: cpuidle driver for a given CPU. + * @dev: cpuidle device for a given CPU. 
+ */ +static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + unsigned int latency_req = 0; + int i, ret = CPUIDLE_DRIVER_STATE_START - 1; + + for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + struct cpuidle_state_usage *su = &dev->states_usage[i]; + + if (s->disabled || su->disable || s->exit_latency <= latency_req) + continue; + + latency_req = s->exit_latency; + ret = i; + } + return ret; +} + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -124,6 +164,9 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (!drv || !dev || !dev->enabled) return -EBUSY; + if (unlikely(use_deepest_state)) + return cpuidle_find_deepest_state(drv, dev); + return cpuidle_curr_governor->select(drv, dev); } @@ -155,7 +198,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ void cpuidle_reflect(struct cpuidle_device *dev, int index) { - if (cpuidle_curr_governor->reflect) + if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state)) cpuidle_curr_governor->reflect(dev, index); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a8d5bd391a26..c51a436135c4 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -143,6 +143,7 @@ extern void cpuidle_resume(void); extern int cpuidle_enable_device(struct cpuidle_device *dev); extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); +extern void cpuidle_use_deepest_state(bool enable); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else @@ -175,6 +176,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } +static inline void 
cpuidle_use_deepest_state(bool enable) {} static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8233cd4047d7..155721f7f909 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -54,9 +54,11 @@ static void freeze_begin(void) static void freeze_enter(void) { + cpuidle_use_deepest_state(true); cpuidle_resume(); wait_event(suspend_freeze_wait_head, suspend_freeze_wake); cpuidle_pause(); + cpuidle_use_deepest_state(false); } void freeze_wake(void) From 8a54cd5bd6ebf009b96ec79510b593f7ba5c0ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 6 May 2014 13:01:56 +0200 Subject: [PATCH 07/11] PM / hibernate: Documentation: Fix script for unswapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit System can have mmaped also character devices (e.g dri devices by X) or deleted files. Running cat on character devices is really bad idea (system can hang) so run cat only on regular files. Also mmaped files can have spaces in filenames. Signed-off-by: Pali Rohár [rjw: Subject] Signed-off-by: Rafael J. Wysocki --- Documentation/power/swsusp.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt index 079160e22bcc..f732a8321e8a 100644 --- a/Documentation/power/swsusp.txt +++ b/Documentation/power/swsusp.txt @@ -220,7 +220,10 @@ Q: After resuming, system is paging heavily, leading to very bad interactivity. A: Try running -cat `cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u` > /dev/null +cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file +do + test -f "$file" && cat "$file" > /dev/null +done after resume. swapoff -a; swapon -a may also be useful. 
From 317cf7e5e85e3ef9f23fc6dd8b2945ab4a258140 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 9 May 2014 23:32:08 +0200 Subject: [PATCH 08/11] PM / hibernate: convert simple_strtoul to kstrtoul Replace obsolete function. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f08ac7f55d8..2377ff72994c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1115,7 +1115,10 @@ static int __init resumewait_setup(char *str) static int __init resumedelay_setup(char *str) { - resume_delay = simple_strtoul(str, NULL, 0); + int rc = kstrtoul(str, 0, (unsigned long *)&resume_delay); + + if (rc) + return rc; return 1; } From f6514be5fe7fe796041b673bad769510414ff2b9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 May 2014 19:08:46 +0300 Subject: [PATCH 09/11] PM / hibernate: Fix memory corruption in resumedelay_setup() In the original code "resume_delay" is an int so on 64 bits, the call to kstrtoul() will cause memory corruption. We may as well fix a style issue here as well and make "resume_delay" unsigned int, since that's what we pass to ssleep(). Fixes: 317cf7e5e85e (PM / hibernate: convert simple_strtoul to kstrtoul) Signed-off-by: Dan Carpenter Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2377ff72994c..df88d55dc436 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -35,7 +35,7 @@ static int nocompress; static int noresume; static int resume_wait; -static int resume_delay; +static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; @@ -1115,7 +1115,7 @@ static int __init resumewait_setup(char *str) static int __init resumedelay_setup(char *str) { - int rc = kstrtoul(str, 0, (unsigned long *)&resume_delay); + int rc = kstrtouint(str, 0, &resume_delay); if (rc) return rc; From aae4518b3124b29f8dc81c829c704fd2df72e98b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 16 May 2014 02:46:50 +0200 Subject: [PATCH 10/11] PM / sleep: Mechanism to avoid resuming runtime-suspended devices unnecessarily Currently, some subsystems (e.g. PCI and the ACPI PM domain) have to resume all runtime-suspended devices during system suspend, mostly because those devices may need to be reprogrammed due to different wakeup settings for system sleep and for runtime PM. For some devices, though, it's OK to remain in runtime suspend throughout a complete system suspend/resume cycle (if the device was in runtime suspend at the start of the cycle). We would like to do this whenever possible, to avoid the overhead of extra power-up and power-down events. However, problems may arise because the device's descendants may require it to be at full power at various points during the cycle. Therefore the most straightforward way to do this safely is if the device and all its descendants can remain runtime suspended until the complete stage of system resume. To this end, introduce a new device PM flag, power.direct_complete and modify the PM core to use that flag as follows. 
If the ->prepare() callback of a device returns a positive number, the PM core will regard that as an indication that it may leave the device runtime-suspended. It will then check if the system power transition in progress is a suspend (and not hibernation in particular) and if the device is, indeed, runtime-suspended. In that case, the PM core will set the device's power.direct_complete flag. Otherwise it will clear power.direct_complete for the device and it also will later clear it for the device's parent (if there's one). Next, the PM core will not invoke the ->suspend(), ->suspend_late(), ->suspend_noirq(), ->resume_noirq(), ->resume_early(), or ->resume() callbacks for all devices having power.direct_complete set. It will invoke their ->complete() callbacks, however, and those callbacks are then responsible for resuming the devices as appropriate, if necessary. For example, in some cases they may need to queue up runtime resume requests for the devices using pm_request_resume(). Changelog partly based on Alan Stern's description of the idea (http://marc.info/?l=linux-pm&m=139940466625569&w=2). Signed-off-by: Rafael J. 
Wysocki Acked-by: Alan Stern --- drivers/base/power/main.c | 66 +++++++++++++++++++++++++++++--------- include/linux/pm.h | 36 ++++++++++++++++----- include/linux/pm_runtime.h | 6 ++++ 3 files changed, 85 insertions(+), 23 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 86d5e4fb5b98..343ffad59377 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -479,7 +479,7 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn TRACE_DEVICE(dev); TRACE_RESUME(0); - if (dev->power.syscore) + if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_noirq_suspended) @@ -605,7 +605,7 @@ static int device_resume_early(struct device *dev, pm_message_t state, bool asyn TRACE_DEVICE(dev); TRACE_RESUME(0); - if (dev->power.syscore) + if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) @@ -735,6 +735,12 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) if (dev->power.syscore) goto Complete; + if (dev->power.direct_complete) { + /* Match the pm_runtime_disable() in __device_suspend(). 
*/ + pm_runtime_enable(dev); + goto Complete; + } + dpm_wait(dev->parent, async); dpm_watchdog_set(&wd, dev); device_lock(dev); @@ -1007,7 +1013,7 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a goto Complete; } - if (dev->power.syscore) + if (dev->power.syscore || dev->power.direct_complete) goto Complete; dpm_wait_for_children(dev, async); @@ -1146,7 +1152,7 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as goto Complete; } - if (dev->power.syscore) + if (dev->power.syscore || dev->power.direct_complete) goto Complete; dpm_wait_for_children(dev, async); @@ -1332,6 +1338,17 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (dev->power.syscore) goto Complete; + if (dev->power.direct_complete) { + if (pm_runtime_status_suspended(dev)) { + pm_runtime_disable(dev); + if (pm_runtime_suspended_if_enabled(dev)) + goto Complete; + + pm_runtime_enable(dev); + } + dev->power.direct_complete = false; + } + dpm_watchdog_set(&wd, dev); device_lock(dev); @@ -1382,10 +1399,19 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) End: if (!error) { + struct device *parent = dev->parent; + dev->power.is_suspended = true; - if (dev->power.wakeup_path - && dev->parent && !dev->parent->power.ignore_children) - dev->parent->power.wakeup_path = true; + if (parent) { + spin_lock_irq(&parent->power.lock); + + dev->parent->power.direct_complete = false; + if (dev->power.wakeup_path + && !dev->parent->power.ignore_children) + dev->parent->power.wakeup_path = true; + + spin_unlock_irq(&parent->power.lock); + } } device_unlock(dev); @@ -1487,7 +1513,7 @@ static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; char *info = NULL; - int error = 0; + int ret = 0; if (dev->power.syscore) return 0; @@ -1523,17 +1549,27 @@ static int device_prepare(struct device *dev, pm_message_t state) callback = 
dev->driver->pm->prepare; } - if (callback) { - error = callback(dev); - suspend_report_result(callback, error); - } + if (callback) + ret = callback(dev); device_unlock(dev); - if (error) + if (ret < 0) { + suspend_report_result(callback, ret); pm_runtime_put(dev); - - return error; + return ret; + } + /* + * A positive return value from ->prepare() means "this device appears + * to be runtime-suspended and its state is fine, so if it really is + * runtime-suspended, you can leave it in that state provided that you + * will do the same thing with all of its descendants". This only + * applies to suspend transitions, however. + */ + spin_lock_irq(&dev->power.lock); + dev->power.direct_complete = ret > 0 && state.event == PM_EVENT_SUSPEND; + spin_unlock_irq(&dev->power.lock); + return 0; } /** diff --git a/include/linux/pm.h b/include/linux/pm.h index d915d0345fa1..72c0fe098a27 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -93,13 +93,23 @@ typedef struct pm_message { * been registered) to recover from the race condition. * This method is executed for all kinds of suspend transitions and is * followed by one of the suspend callbacks: @suspend(), @freeze(), or - * @poweroff(). The PM core executes subsystem-level @prepare() for all - * devices before starting to invoke suspend callbacks for any of them, so - * generally devices may be assumed to be functional or to respond to - * runtime resume requests while @prepare() is being executed. However, - * device drivers may NOT assume anything about the availability of user - * space at that time and it is NOT valid to request firmware from within - * @prepare() (it's too late to do that). It also is NOT valid to allocate + * @poweroff(). If the transition is a suspend to memory or standby (that + * is, not related to hibernation), the return value of @prepare() may be + * used to indicate to the PM core to leave the device in runtime suspend + * if applicable. 
Namely, if @prepare() returns a positive number, the PM + * core will understand that as a declaration that the device appears to be + * runtime-suspended and it may be left in that state during the entire + * transition and during the subsequent resume if all of its descendants + * are left in runtime suspend too. If that happens, @complete() will be + * executed directly after @prepare() and it must ensure the proper + * functioning of the device after the system resume. + * The PM core executes subsystem-level @prepare() for all devices before + * starting to invoke suspend callbacks for any of them, so generally + * devices may be assumed to be functional or to respond to runtime resume + * requests while @prepare() is being executed. However, device drivers + * may NOT assume anything about the availability of user space at that + * time and it is NOT valid to request firmware from within @prepare() + * (it's too late to do that). It also is NOT valid to allocate * substantial amounts of memory from @prepare() in the GFP_KERNEL mode. * [To work around these limitations, drivers may register suspend and * hibernation notifiers to be executed before the freezing of tasks.] @@ -112,7 +122,16 @@ typedef struct pm_message { * of the other devices that the PM core has unsuccessfully attempted to * suspend earlier). * The PM core executes subsystem-level @complete() after it has executed - * the appropriate resume callbacks for all devices. + * the appropriate resume callbacks for all devices. If the corresponding + * @prepare() at the beginning of the suspend transition returned a + * positive number and the device was left in runtime suspend (without + * executing any suspend and resume callbacks for it), @complete() will be + * the only callback executed for the device during resume. In that case, + * @complete() must be prepared to do whatever is necessary to ensure the + * proper functioning of the device after the system resume. 
To this end, + * @complete() can check the power.direct_complete flag of the device to + * learn whether (unset) or not (set) the previous suspend and resume + * callbacks have been executed for it. * * @suspend: Executed before putting the system into a sleep state in which the * contents of main memory are preserved. The exact action to perform @@ -546,6 +565,7 @@ struct dev_pm_info { bool is_late_suspended:1; bool ignore_children:1; bool early_init:1; /* Owned by the PM core */ + bool direct_complete:1; /* Owned by the PM core */ spinlock_t lock; #ifdef CONFIG_PM_SLEEP struct list_head entry; diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 2a5897a4afbc..43fd6716f662 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -101,6 +101,11 @@ static inline bool pm_runtime_status_suspended(struct device *dev) return dev->power.runtime_status == RPM_SUSPENDED; } +static inline bool pm_runtime_suspended_if_enabled(struct device *dev) +{ + return pm_runtime_status_suspended(dev) && dev->power.disable_depth == 1; +} + static inline bool pm_runtime_enabled(struct device *dev) { return !dev->power.disable_depth; @@ -150,6 +155,7 @@ static inline void device_set_run_wake(struct device *dev, bool enable) {} static inline bool pm_runtime_suspended(struct device *dev) { return false; } static inline bool pm_runtime_active(struct device *dev) { return true; } static inline bool pm_runtime_status_suspended(struct device *dev) { return false; } +static inline bool pm_runtime_suspended_if_enabled(struct device *dev) { return false; } static inline bool pm_runtime_enabled(struct device *dev) { return false; } static inline void pm_runtime_no_callbacks(struct device *dev) {} From f71495f3f0c5f0801823d1235b271a4a415d3df8 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Fri, 16 May 2014 02:47:37 +0200 Subject: [PATCH 11/11] PM / sleep: Update device PM documentation to cover direct_complete Update the device PM documentation in devices.txt and runtime_pm.txt to reflect the changes in the system suspend and resume handling related to the introduction of the new power.direct_complete flag. Signed-off-by: Rafael J. Wysocki Acked-by: Alan Stern --- Documentation/power/devices.txt | 34 ++++++++++++++++++++++++++---- Documentation/power/runtime_pm.txt | 17 +++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 47d46dff70f7..d172bce0fd49 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -2,6 +2,7 @@ Device Power Management Copyright (c) 2010-2011 Rafael J. Wysocki , Novell Inc. Copyright (c) 2010 Alan Stern +Copyright (c) 2014 Intel Corp., Rafael J. Wysocki Most of the code in Linux is device drivers, so most of the Linux power @@ -326,6 +327,20 @@ the phases are: driver in some way for the upcoming system power transition, but it should not put the device into a low-power state. + For devices supporting runtime power management, the return value of the + prepare callback can be used to indicate to the PM core that it may + safely leave the device in runtime suspend (if runtime-suspended + already), provided that all of the device's descendants are also left in + runtime suspend. Namely, if the prepare callback returns a positive + number and that happens for all of the descendants of the device too, + and all of them (including the device itself) are runtime-suspended, the + PM core will skip the suspend, suspend_late and suspend_noirq suspend + phases as well as the resume_noirq, resume_early and resume phases of + the following system resume for all of these devices. 
In that case, + the complete callback will be called directly after the prepare callback + and is entirely responsible for bringing the device back to the + functional state as appropriate. + 2. The suspend methods should quiesce the device to stop it from performing I/O. They also may save the device registers and put it into the appropriate low-power state, depending on the bus type the device is on, @@ -400,12 +415,23 @@ When resuming from freeze, standby or memory sleep, the phases are: the resume callbacks occur; it's not necessary to wait until the complete phase. + Moreover, if the preceding prepare callback returned a positive number, + the device may have been left in runtime suspend throughout the whole + system suspend and resume (the suspend, suspend_late, suspend_noirq + phases of system suspend and the resume_noirq, resume_early, resume + phases of system resume may have been skipped for it). In that case, + the complete callback is entirely responsible for bringing the device + back to the functional state after system suspend if necessary. [For + example, it may need to queue up a runtime resume request for the device + for this purpose.] To check if that is the case, the complete callback + can consult the device's power.direct_complete flag. Namely, if that + flag is set when the complete callback is being run, it has been called + directly after the preceding prepare and special action may be required + to make the device work correctly afterward. + At the end of these phases, drivers should be as functional as they were before suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are -gated on. Even if the device was in a low-power state before the system sleep -because of runtime power management, afterwards it should be back in its -full-power state. There are multiple reasons why it's best to do this; they are -discussed in more detail in Documentation/power/runtime_pm.txt. +gated on. 
However, the details here may again be platform-specific. For example, some systems support multiple "run" states, and the mode in effect at diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 5f96daf8566a..e1bee8a4aaac 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -2,6 +2,7 @@ Runtime Power Management Framework for I/O Devices (C) 2009-2011 Rafael J. Wysocki , Novell Inc. (C) 2010 Alan Stern +(C) 2014 Intel Corp., Rafael J. Wysocki 1. Introduction @@ -444,6 +445,10 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: bool pm_runtime_status_suspended(struct device *dev); - return true if the device's runtime PM status is 'suspended' + bool pm_runtime_suspended_if_enabled(struct device *dev); + - return true if the device's runtime PM status is 'suspended' and its + 'power.disable_depth' field is equal to 1 + void pm_runtime_allow(struct device *dev); - set the power.runtime_auto flag for the device and decrease its usage counter (used by the /sys/devices/.../power/control interface to @@ -644,6 +649,18 @@ place (in particular, if the system is not waking up from hibernation), it may be more efficient to leave the devices that had been suspended before the system suspend began in the suspended state. +To this end, the PM core provides a mechanism allowing some coordination between +different levels of device hierarchy. Namely, if a system suspend .prepare() +callback returns a positive number for a device, that indicates to the PM core +that the device appears to be runtime-suspended and its state is fine, so it +may be left in runtime suspend provided that all of its descendants are also +left in runtime suspend. If that happens, the PM core will not execute any +system suspend and resume callbacks for all of those devices, except for the +complete callback, which is then entirely responsible for handling the device +as appropriate. 
This only applies to system suspend transitions that are not +related to hibernation (see Documentation/power/devices.txt for more +information). + The PM core does its best to reduce the probability of race conditions between the runtime PM and system suspend/resume (and hibernation) callbacks by carrying out the following operations: