From 6b2c676bf32be91f43215d5874c07c1becaba013 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 11 May 2010 16:50:52 -0400 Subject: [PATCH 1/8] cpuidle: fail to register if !CONFIG_CPU_IDLE Signed-off-by: Len Brown --- include/linux/cpuidle.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index dcf77fa826b5..f5e6480dc16f 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -137,16 +137,16 @@ extern void cpuidle_disable_device(struct cpuidle_device *dev); #else static inline int cpuidle_register_driver(struct cpuidle_driver *drv) -{return 0;} +{return -ENODEV; } static inline void cpuidle_unregister_driver(struct cpuidle_driver *drv) { } static inline int cpuidle_register_device(struct cpuidle_device *dev) -{return 0;} +{return -ENODEV; } static inline void cpuidle_unregister_device(struct cpuidle_device *dev) { } static inline void cpuidle_pause_and_lock(void) { } static inline void cpuidle_resume_and_unlock(void) { } static inline int cpuidle_enable_device(struct cpuidle_device *dev) -{return 0;} +{return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } #endif From c0d64cb031c21f163a0ec15cf10844bcf0ceedcf Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 22 May 2010 16:34:10 -0400 Subject: [PATCH 2/8] cpuidle: add cpuidle_unregister_driver() error check Assure that cpuidle_unregister_driver() will not clobber the registered driver if unregistered by somebody else. Signed-off-by: Len Brown --- drivers/cpuidle/driver.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 2257004fe33d..826b5c0aa12b 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -45,8 +45,11 @@ EXPORT_SYMBOL_GPL(cpuidle_register_driver); */ void cpuidle_unregister_driver(struct cpuidle_driver *drv) { - if (!drv) + if (drv != cpuidle_curr_driver) { + WARN(1, "invalid cpuidle_unregister_driver(%s)\n", + drv->name); return; + } spin_lock(&cpuidle_driver_lock); cpuidle_curr_driver = NULL; From 752138df0dc2daaae09379c754caeb08c97905dc Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 22 May 2010 16:57:26 -0400 Subject: [PATCH 3/8] cpuidle: make cpuidle_curr_driver static cpuidle_register_driver() sets cpuidle_curr_driver cpuidle_unregister_driver() clears cpuidle_curr_driver We should't expose cpuidle_curr_driver to potential modification except via these interfaces. So make it static and create cpuidle_get_driver() to observe it. Signed-off-by: Len Brown --- drivers/cpuidle/cpuidle.c | 12 +++++++----- drivers/cpuidle/cpuidle.h | 1 - drivers/cpuidle/driver.c | 11 ++++++++++- drivers/cpuidle/sysfs.c | 5 +++-- include/linux/cpuidle.h | 2 ++ 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 12fdd3987a36..199488576a05 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -156,7 +156,7 @@ int cpuidle_enable_device(struct cpuidle_device *dev) if (dev->enabled) return 0; - if (!cpuidle_curr_driver || !cpuidle_curr_governor) + if (!cpuidle_get_driver() || !cpuidle_curr_governor) return -EIO; if (!dev->state_count) return -EINVAL; @@ -207,7 +207,7 @@ void cpuidle_disable_device(struct cpuidle_device *dev) { if (!dev->enabled) return; - if (!cpuidle_curr_driver || !cpuidle_curr_governor) + if (!cpuidle_get_driver() || !cpuidle_curr_governor) return; dev->enabled = 0; @@ -271,10 +271,11 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) { int ret; struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu); + struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver(); if (!sys_dev) return -EINVAL; - if (!try_module_get(cpuidle_curr_driver->owner)) + if (!try_module_get(cpuidle_driver->owner)) return -EINVAL; init_completion(&dev->kobj_unregister); @@ -284,7 +285,7 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) per_cpu(cpuidle_devices, dev->cpu) = dev; list_add(&dev->device_list, &cpuidle_detected_devices); if ((ret = cpuidle_add_sysfs(sys_dev))) { - module_put(cpuidle_curr_driver->owner); + module_put(cpuidle_driver->owner); return ret; } @@ -325,6 +326,7 @@ EXPORT_SYMBOL_GPL(cpuidle_register_device); void cpuidle_unregister_device(struct cpuidle_device *dev) { struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu); + struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver(); if (dev->registered == 0) return; @@ -340,7 +342,7 @@ void cpuidle_unregister_device(struct cpuidle_device *dev) cpuidle_resume_and_unlock(); - module_put(cpuidle_curr_driver->owner); + module_put(cpuidle_driver->owner); } EXPORT_SYMBOL_GPL(cpuidle_unregister_device); diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h index 9476ba33ee2c..33e50d556f17 100644 --- a/drivers/cpuidle/cpuidle.h +++ b/drivers/cpuidle/cpuidle.h @@ -9,7 +9,6 @@ /* For internal use only */ extern struct cpuidle_governor *cpuidle_curr_governor; -extern struct cpuidle_driver *cpuidle_curr_driver; extern struct list_head cpuidle_governors; extern struct list_head cpuidle_detected_devices; extern struct mutex cpuidle_lock; diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 826b5c0aa12b..fd1601e3d125 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -14,7 +14,7 @@ #include "cpuidle.h" -struct cpuidle_driver *cpuidle_curr_driver; +static struct cpuidle_driver *cpuidle_curr_driver; DEFINE_SPINLOCK(cpuidle_driver_lock); /** @@ -39,6 +39,15 @@ int cpuidle_register_driver(struct cpuidle_driver *drv) EXPORT_SYMBOL_GPL(cpuidle_register_driver); +/** + * cpuidle_get_driver - return the current driver + */ +struct cpuidle_driver *cpuidle_get_driver(void) +{ + return cpuidle_curr_driver; +} +EXPORT_SYMBOL_GPL(cpuidle_get_driver); + /** * cpuidle_unregister_driver - unregisters a driver * @drv: the driver diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 0ba9c8b8ee74..0310ffaec9df 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -47,10 +47,11 @@ static ssize_t show_current_driver(struct sysdev_class *class, char *buf) { ssize_t ret; + struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver(); spin_lock(&cpuidle_driver_lock); - if (cpuidle_curr_driver) - ret = sprintf(buf, "%s\n", cpuidle_curr_driver->name); + if (cpuidle_driver) + ret = sprintf(buf, "%s\n", cpuidle_driver->name); else ret = sprintf(buf, "none\n"); spin_unlock(&cpuidle_driver_lock); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index f5e6480dc16f..55215cce5005 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -125,6 +125,7 @@ struct cpuidle_driver { #ifdef CONFIG_CPU_IDLE extern int cpuidle_register_driver(struct cpuidle_driver *drv); +struct cpuidle_driver *cpuidle_get_driver(void); extern void cpuidle_unregister_driver(struct cpuidle_driver *drv); extern int cpuidle_register_device(struct cpuidle_device *dev); extern void cpuidle_unregister_device(struct cpuidle_device *dev); @@ -138,6 +139,7 @@ extern void cpuidle_disable_device(struct cpuidle_device *dev); static inline int cpuidle_register_driver(struct cpuidle_driver *drv) {return -ENODEV; } +static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; } static inline void cpuidle_unregister_driver(struct cpuidle_driver *drv) { } static inline int cpuidle_register_device(struct cpuidle_device *dev) {return -ENODEV; } From 541adf7cd937b3895c7645406a9b060504df453b Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 22 May 2010 17:03:29 -0400 Subject: [PATCH 4/8] ACPI: allow a native cpuidle driver to displace ACPI The ACPI driver would fail probe when it found that another driver had previously registered with cpuidle. But this is a natural situation, as a native hardware cpuidle driver should be able to bind instead of ACPI, and the ACPI processor driver should be able to handle yielding control of C-states while still handling P-states and T-states. Add a KERN_DEBUG line showing when acpi_idle does successfully register. Signed-off-by: Len Brown --- drivers/acpi/processor_driver.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 5675d9747e87..deefa8591746 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -616,7 +616,8 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device) acpi_processor_get_limit_info(pr); - acpi_processor_power_init(pr, device); + if (cpuidle_get_driver() == &acpi_idle_driver) + acpi_processor_power_init(pr, device); pr->cdev = thermal_cooling_device_register("Processor", device, &processor_cooling_ops); @@ -920,9 +921,10 @@ static int __init acpi_processor_init(void) if (!acpi_processor_dir) return -ENOMEM; #endif - result = cpuidle_register_driver(&acpi_idle_driver); - if (result < 0) - goto out_proc; + + if (!cpuidle_register_driver(&acpi_idle_driver)) + printk(KERN_DEBUG "ACPI: %s registered with cpuidle\n", + acpi_idle_driver.name); result = acpi_bus_register_driver(&acpi_processor_driver); if (result < 0) @@ -941,7 +943,6 @@ static int __init acpi_processor_init(void) out_cpuidle: cpuidle_unregister_driver(&acpi_idle_driver); -out_proc: #ifdef CONFIG_ACPI_PROCFS remove_proc_entry(ACPI_PROCESSOR_CLASS, acpi_root_dir); #endif From e45b7fa23097332508730123ac6d59227e7bd7f8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 24 May 2010 11:34:36 -0400 Subject: [PATCH 5/8] sched: clarify commment for TS_POLLING TS_POLLING set tells the scheduler an idle_task will poll need_resched() to look for work. TS_POLLING clear tells resched_task() and wake_up_idle_cpu() that the remote CPU's idle_task is now sleeping in idle, and thus requires a reschedule interrupt notice work. Update the description of TS_POLLING to reflect how it works. "idle task polling need_resched, skip sending interrupt" Wordsmithing-by: Milton Miller Signed-off-by: Len Brown Acked-by: Peter Zijlstra --- arch/x86/include/asm/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e0d28901e969..812186919bba 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -241,8 +241,8 @@ static inline struct thread_info *current_thread_info(void) #define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -#define TS_POLLING 0x0004 /* true if in idle loop - and not sleeping */ +#define TS_POLLING 0x0004 /* idle task polling need_resched, + skip sending interrupt */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ #define TS_XSAVE 0x0010 /* Use xsave/xrstor */ From a7d27c37533524e23bbdc9ba1d78e3241b7483ea Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 24 May 2010 11:41:00 -0400 Subject: [PATCH 6/8] acpi_pad: uses MONITOR/MWAIT, so it doesn't need to clear TS_POLLING api_pad exclusively uses MONITOR/MWAIT to sleep in idle, so it does not need the wakeup IPI during idle sleep that is provoked by clearing TS_POLLING. Signed-off-by: Len Brown Cc: Shaohua Li --- drivers/acpi/acpi_pad.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index 62122134693b..7edf053983c5 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -168,13 +168,6 @@ static int power_saving_thread(void *data) do_sleep = 0; - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we test - * NEED_RESCHED: - */ - smp_mb(); - expire_time = jiffies + HZ * (100 - idle_pct) / 100; while (!need_resched()) { @@ -200,8 +193,6 @@ static int power_saving_thread(void *data) } } - current_thread_info()->status |= TS_POLLING; - /* * current sched_rt has threshold for rt task running time. * When a rt task uses 95% CPU time, the rt thread will be From 02cf4f9808382af7265cafc33dc86ec5875526aa Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 24 May 2010 14:27:44 -0400 Subject: [PATCH 7/8] ACPI: acpi_idle: touch TS_POLLING only in the non-MWAIT case commit d306ebc28649b89877a22158fe0076f06cc46f60 (ACPI: Be in TS_POLLING state during mwait based C-state entry) fixed an important power & performance issue where ACPI c2 and c3 C-states were clearing TS_POLLING even when using MWAIT (ACPI_STATE_FFH). That bug had been causing us to receive redundant scheduling interrups when we had already been woken up by MONITOR/MWAIT. Following up on that... In the MWAIT case, we don't have to subsequently check need_resched(), as that c heck was there for the TS_POLLING-clearing case. Note that not only does the cpuidle calling function already check need_resched() before calling us, the low-level entry into monitor/mwait calls it twice -- guaranteeing that a write to the trigger address can not go un-noticed. Also, in this case, we don't have to set TS_POLLING when we wake, because we never cleared it. Signed-off-by: Len Brown Acked-by: Venkatesh Pallipadi --- drivers/acpi/processor_idle.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5939e7f7d8e9..a4166e2abb92 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -881,6 +881,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, return(acpi_idle_enter_c1(dev, state)); local_irq_disable(); + if (cx->entry_method != ACPI_CSTATE_FFH) { current_thread_info()->status &= ~TS_POLLING; /* @@ -888,12 +889,12 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, * NEED_RESCHED: */ smp_mb(); - } - if (unlikely(need_resched())) { - current_thread_info()->status |= TS_POLLING; - local_irq_enable(); - return 0; + if (unlikely(need_resched())) { + current_thread_info()->status |= TS_POLLING; + local_irq_enable(); + return 0; + } } /* @@ -918,7 +919,8 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); local_irq_enable(); - current_thread_info()->status |= TS_POLLING; + if (cx->entry_method != ACPI_CSTATE_FFH) + current_thread_info()->status |= TS_POLLING; cx->usage++; @@ -968,6 +970,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, } local_irq_disable(); + if (cx->entry_method != ACPI_CSTATE_FFH) { current_thread_info()->status &= ~TS_POLLING; /* @@ -975,12 +978,12 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, * NEED_RESCHED: */ smp_mb(); - } - if (unlikely(need_resched())) { - current_thread_info()->status |= TS_POLLING; - local_irq_enable(); - return 0; + if (unlikely(need_resched())) { + current_thread_info()->status |= TS_POLLING; + local_irq_enable(); + return 0; + } } acpi_unlazy_tlb(smp_processor_id()); @@ -1032,7 +1035,8 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); local_irq_enable(); - current_thread_info()->status |= TS_POLLING; + if (cx->entry_method != ACPI_CSTATE_FFH) + current_thread_info()->status |= TS_POLLING; cx->usage++; From 2671717265ae6e720a9ba5f13fbec3a718983b65 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 8 Mar 2010 14:07:30 -0500 Subject: [PATCH 8/8] intel_idle: native hardware cpuidle driver for latest Intel processors This EXPERIMENTAL driver supersedes acpi_idle on Intel Atom Processors, Intel Core i3/i5/i7 Processors and associated Intel Xeon processors. It does not support the Intel Core2 processor or earlier. For kernels configured with ACPI, CONFIG_INTEL_IDLE=y allows intel_idle to probe before the ACPI processor driver. Booting with "intel_idle.max_cstate=0" disables intel_idle and the system will fall back on ACPI's "acpi_idle". Typical Linux distributions load ACPI processor module early, making CONFIG_INTEL_IDLE=m not easily useful on ACPI platforms. intel_idle probes all processors at module_init time. Processors that are hot-added later will be limited to using C1 in idle. Signed-off-by: Len Brown --- MAINTAINERS | 7 + drivers/Makefile | 2 +- drivers/acpi/processor_driver.c | 6 +- drivers/idle/Kconfig | 11 + drivers/idle/Makefile | 1 + drivers/idle/intel_idle.c | 461 ++++++++++++++++++++++++++++++++ 6 files changed, 486 insertions(+), 2 deletions(-) create mode 100755 drivers/idle/intel_idle.c diff --git a/MAINTAINERS b/MAINTAINERS index d329b053a718..276e79bb462c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2850,6 +2850,13 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git S: Maintained F: drivers/input/ +INTEL IDLE DRIVER +M: Len Brown +L: linux-pm@lists.linux-foundation.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux-idle-2.6.git +S: Supported +F: drivers/idle/intel_idle.c + INTEL FRAMEBUFFER DRIVER (excluding 810 and 815) M: Maik Broemme L: linux-fbdev@vger.kernel.org diff --git a/drivers/Makefile b/drivers/Makefile index f42a03029b7c..91874e048552 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_PCI) += pci/ obj-$(CONFIG_PARISC) += parisc/ obj-$(CONFIG_RAPIDIO) += rapidio/ obj-y += video/ +obj-y += idle/ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_SFI) += sfi/ # PnP must come after ACPI since it will eventually need to check if acpi @@ -91,7 +92,6 @@ obj-$(CONFIG_EISA) += eisa/ obj-y += lguest/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_IDLE) += cpuidle/ -obj-y += idle/ obj-$(CONFIG_MMC) += mmc/ obj-$(CONFIG_MEMSTICK) += memstick/ obj-$(CONFIG_NEW_LEDS) += leds/ diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index deefa8591746..b1034a9ada4e 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -922,9 +922,13 @@ static int __init acpi_processor_init(void) return -ENOMEM; #endif - if (!cpuidle_register_driver(&acpi_idle_driver)) + if (!cpuidle_register_driver(&acpi_idle_driver)) { printk(KERN_DEBUG "ACPI: %s registered with cpuidle\n", acpi_idle_driver.name); + } else { + printk(KERN_DEBUG "ACPI: acpi_idle yielding to %s", + cpuidle_get_driver()->name); + } result = acpi_bus_register_driver(&acpi_processor_driver); if (result < 0) diff --git a/drivers/idle/Kconfig b/drivers/idle/Kconfig index f15e90a453d1..fb5c5186d4aa 100644 --- a/drivers/idle/Kconfig +++ b/drivers/idle/Kconfig @@ -1,3 +1,14 @@ +config INTEL_IDLE + tristate "Cpuidle Driver for Intel Processors" + depends on CPU_IDLE + depends on X86 + depends on CPU_SUP_INTEL + depends on EXPERIMENTAL + help + Enable intel_idle, a cpuidle driver that includes knowledge of + native Intel hardware idle features. The acpi_idle driver + can be configured at the same time, in order to handle + processors intel_idle does not support. menu "Memory power savings" depends on X86_64 diff --git a/drivers/idle/Makefile b/drivers/idle/Makefile index 5f68fc377e21..23d295cf10f2 100644 --- a/drivers/idle/Makefile +++ b/drivers/idle/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_I7300_IDLE) += i7300_idle.o +obj-$(CONFIG_INTEL_IDLE) += intel_idle.o diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c new file mode 100755 index 000000000000..54f0fb4cd5d2 --- /dev/null +++ b/drivers/idle/intel_idle.c @@ -0,0 +1,461 @@ +/* + * intel_idle.c - native hardware idle loop for modern Intel processors + * + * Copyright (c) 2010, Intel Corporation. + * Len Brown + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * intel_idle is a cpuidle driver that loads on specific Intel processors + * in lieu of the legacy ACPI processor_idle driver. The intent is to + * make Linux more efficient on these processors, as intel_idle knows + * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs. + */ + +/* + * Design Assumptions + * + * All CPUs have same idle states as boot CPU + * + * Chipset BM_STS (bus master status) bit is a NOP + * for preventing entry into deep C-stats + */ + +/* + * Known limitations + * + * The driver currently initializes for_each_online_cpu() upon modprobe. + * It it unaware of subsequent processors hot-added to the system. + * This means that if you boot with maxcpus=n and later online + * processors above n, those processors will use C1 only. + * + * ACPI has a .suspend hack to turn off deep c-statees during suspend + * to avoid complications with the lapic timer workaround. + * Have not seen issues with suspend, but may need same workaround here. + * + * There is currently no kernel-based automatic probing/loading mechanism + * if the driver is built as a module. + */ + +/* un-comment DEBUG to enable pr_debug() statements */ +#define DEBUG + +#include +#include +#include +#include /* ktime_get_real() */ +#include +#include + +#define INTEL_IDLE_VERSION "0.4" +#define PREFIX "intel_idle: " + +#define MWAIT_SUBSTATE_MASK (0xf) +#define MWAIT_CSTATE_MASK (0xf) +#define MWAIT_SUBSTATE_SIZE (4) +#define MWAIT_MAX_NUM_CSTATES 8 +#define CPUID_MWAIT_LEAF (5) +#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1) +#define CPUID5_ECX_INTERRUPT_BREAK (0x2) + +static struct cpuidle_driver intel_idle_driver = { + .name = "intel_idle", + .owner = THIS_MODULE, +}; +/* intel_idle.max_cstate=0 disables driver */ +static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1; +static int power_policy = 7; /* 0 = max perf; 15 = max powersave */ + +static unsigned int substates; +static int (*choose_substate)(int); + +/* Reliable LAPIC Timer States, bit 1 for C1 etc. */ +static unsigned int lapic_timer_reliable_states; + +static struct cpuidle_device *intel_idle_cpuidle_devices; +static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state); + +static struct cpuidle_state *cpuidle_state_table; + +/* + * States are indexed by the cstate number, + * which is also the index into the MWAIT hint array. + * Thus C0 is a dummy. + */ +static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = { + { /* MWAIT C0 */ }, + { /* MWAIT C1 */ + .name = "NHM-C1", + .desc = "MWAIT 0x00", + .driver_data = (void *) 0x00, + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 3, + .power_usage = 1000, + .target_residency = 6, + .enter = &intel_idle }, + { /* MWAIT C2 */ + .name = "NHM-C3", + .desc = "MWAIT 0x10", + .driver_data = (void *) 0x10, + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 20, + .power_usage = 500, + .target_residency = 80, + .enter = &intel_idle }, + { /* MWAIT C3 */ + .name = "NHM-C6", + .desc = "MWAIT 0x20", + .driver_data = (void *) 0x20, + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 200, + .power_usage = 350, + .target_residency = 800, + .enter = &intel_idle }, +}; + +static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = { + { /* MWAIT C0 */ }, + { /* MWAIT C1 */ + .name = "ATM-C1", + .desc = "MWAIT 0x00", + .driver_data = (void *) 0x00, + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 1, + .power_usage = 1000, + .target_residency = 4, + .enter = &intel_idle }, + { /* MWAIT C2 */ + .name = "ATM-C2", + .desc = "MWAIT 0x10", + .driver_data = (void *) 0x10, + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 20, + .power_usage = 500, + .target_residency = 80, + .enter = &intel_idle }, + { /* MWAIT C3 */ }, + { /* MWAIT C4 */ + .name = "ATM-C4", + .desc = "MWAIT 0x30", + .driver_data = (void *) 0x30, + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 100, + .power_usage = 250, + .target_residency = 400, + .enter = &intel_idle }, + { /* MWAIT C5 */ }, + { /* MWAIT C6 */ + .name = "ATM-C6", + .desc = "MWAIT 0x40", + .driver_data = (void *) 0x40, + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 200, + .power_usage = 150, + .target_residency = 800, + .enter = NULL }, /* disabled */ +}; + +/* + * choose_tunable_substate() + * + * Run-time decision on which C-state substate to invoke + * If power_policy = 0, choose shallowest substate (0) + * If power_policy = 15, choose deepest substate + * If power_policy = middle, choose middle substate etc. + */ +static int choose_tunable_substate(int cstate) +{ + unsigned int num_substates; + unsigned int substate_choice; + + power_policy &= 0xF; /* valid range: 0-15 */ + cstate &= 7; /* valid range: 0-7 */ + + num_substates = (substates >> ((cstate) * 4)) & MWAIT_SUBSTATE_MASK; + + if (num_substates <= 1) + return 0; + + substate_choice = ((power_policy + (power_policy + 1) * + (num_substates - 1)) / 16); + + return substate_choice; +} + +/* + * choose_zero_substate() + */ +static int choose_zero_substate(int cstate) +{ + return 0; +} + +/** + * intel_idle + * @dev: cpuidle_device + * @state: cpuidle state + * + */ +static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state) +{ + unsigned long ecx = 1; /* break on interrupt flag */ + unsigned long eax = (unsigned long)cpuidle_get_statedata(state); + unsigned int cstate; + ktime_t kt_before, kt_after; + s64 usec_delta; + int cpu = smp_processor_id(); + + cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; + + eax = eax + (choose_substate)(cstate); + + local_irq_disable(); + + if (!(lapic_timer_reliable_states & (1 << (cstate)))) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); + + kt_before = ktime_get_real(); + + stop_critical_timings(); +#ifndef MODULE + trace_power_start(POWER_CSTATE, (eax >> 4) + 1); +#endif + if (!need_resched()) { + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __mwait(eax, ecx); + } + + start_critical_timings(); + + kt_after = ktime_get_real(); + usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before)); + + local_irq_enable(); + + if (!(lapic_timer_reliable_states & (1 << (cstate)))) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + + return usec_delta; +} + +/* + * intel_idle_probe() + */ +static int intel_idle_probe(void) +{ + unsigned int eax, ebx, ecx, edx; + + if (max_cstate == 0) { + pr_debug(PREFIX "disabled\n"); + return -EPERM; + } + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + if (!boot_cpu_has(X86_FEATURE_MWAIT)) + return -ENODEV; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return -ENODEV; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || + !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) + return -ENODEV; +#ifdef DEBUG + if (substates == 0) /* can over-ride via modparam */ +#endif + substates = edx; + + pr_debug(PREFIX "MWAIT substates: 0x%x\n", substates); + + if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ + lapic_timer_reliable_states = 0xFFFFFFFF; + + if (boot_cpu_data.x86 != 6) /* family 6 */ + return -ENODEV; + + switch (boot_cpu_data.x86_model) { + + case 0x1A: /* Core i7, Xeon 5500 series */ + case 0x1E: /* Core i7 and i5 Processor - Lynnfield Jasper Forest */ + case 0x1F: /* Core i7 and i5 Processor - Nehalem */ + case 0x2E: /* Nehalem-EX Xeon */ + lapic_timer_reliable_states = (1 << 1); /* C1 */ + + case 0x25: /* Westmere */ + case 0x2C: /* Westmere */ + cpuidle_state_table = nehalem_cstates; + choose_substate = choose_tunable_substate; + break; + + case 0x1C: /* 28 - Atom Processor */ + lapic_timer_reliable_states = (1 << 2) | (1 << 1); /* C2, C1 */ + cpuidle_state_table = atom_cstates; + choose_substate = choose_zero_substate; + break; +#ifdef FUTURE_USE + case 0x17: /* 23 - Core 2 Duo */ + lapic_timer_reliable_states = (1 << 2) | (1 << 1); /* C2, C1 */ +#endif + + default: + pr_debug(PREFIX "does not run on family %d model %d\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + return -ENODEV; + } + + pr_debug(PREFIX "v" INTEL_IDLE_VERSION + " model 0x%X\n", boot_cpu_data.x86_model); + + pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", + lapic_timer_reliable_states); + return 0; +} + +/* + * intel_idle_cpuidle_devices_uninit() + * unregister, free cpuidle_devices + */ +static void intel_idle_cpuidle_devices_uninit(void) +{ + int i; + struct cpuidle_device *dev; + + for_each_online_cpu(i) { + dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); + cpuidle_unregister_device(dev); + } + + free_percpu(intel_idle_cpuidle_devices); + return; +} +/* + * intel_idle_cpuidle_devices_init() + * allocate, initialize, register cpuidle_devices + */ +static int intel_idle_cpuidle_devices_init(void) +{ + int i, cstate; + struct cpuidle_device *dev; + + intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); + if (intel_idle_cpuidle_devices == NULL) + return -ENOMEM; + + for_each_online_cpu(i) { + dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); + + dev->state_count = 1; + + for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { + int num_substates; + + if (cstate > max_cstate) { + printk(PREFIX "max_cstate %d reached\n", + max_cstate); + break; + } + + /* does the state exist in CPUID.MWAIT? */ + num_substates = (substates >> ((cstate) * 4)) + & MWAIT_SUBSTATE_MASK; + if (num_substates == 0) + continue; + /* is the state not enabled? */ + if (cpuidle_state_table[cstate].enter == NULL) { + /* does the driver not know about the state? */ + if (*cpuidle_state_table[cstate].name == '\0') + pr_debug(PREFIX "unaware of model 0x%x" + " MWAIT %d please" + " contact lenb@kernel.org", + boot_cpu_data.x86_model, cstate); + continue; + } + + if ((cstate > 2) && + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halts in idle" + " states deeper than C2"); + + dev->states[dev->state_count] = /* structure copy */ + cpuidle_state_table[cstate]; + + dev->state_count += 1; + } + + dev->cpu = i; + if (cpuidle_register_device(dev)) { + pr_debug(PREFIX "cpuidle_register_device %d failed!\n", + i); + intel_idle_cpuidle_devices_uninit(); + return -EIO; + } + } + + return 0; +} + + +static int __init intel_idle_init(void) +{ + int retval; + + retval = intel_idle_probe(); + if (retval) + return retval; + + retval = cpuidle_register_driver(&intel_idle_driver); + if (retval) { + printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", + cpuidle_get_driver()->name); + return retval; + } + + retval = intel_idle_cpuidle_devices_init(); + if (retval) { + cpuidle_unregister_driver(&intel_idle_driver); + return retval; + } + + return 0; +} + +static void __exit intel_idle_exit(void) +{ + intel_idle_cpuidle_devices_uninit(); + cpuidle_unregister_driver(&intel_idle_driver); + + return; +} + +module_init(intel_idle_init); +module_exit(intel_idle_exit); + +module_param(power_policy, int, 0644); +module_param(max_cstate, int, 0444); +#ifdef DEBUG +module_param(substates, int, 0444); +#endif + +MODULE_AUTHOR("Len Brown "); +MODULE_DESCRIPTION("Cpuidle driver for Intel Hardware v" INTEL_IDLE_VERSION); +MODULE_LICENSE("GPL");