From a831881be220358a1d28c5d95d69449fb6d623ca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 18 Dec 2012 17:32:19 +0100 Subject: [PATCH 01/39] nohz: Basic full dynticks interface For extreme use cases such as Real Time or HPC, having the ability to shut down the tick when a single task runs on a CPU is a desired feature: * Reducing the number of interrupts improves throughput for CPU-bound tasks. The CPU is less distracted from its real job, from both an execution-time and a cache point of view. * This also improves latency response, as we have fewer critical sections. Start by introducing a very simple interface to define full dynticks CPUs: a cpumask set at boot time through the "nohz_extended=" kernel parameter. CPUs that are part of this range will have their tick shut down whenever possible: provided they run a single task and they don't do kernel activity that requires the periodic tick. These details will later be documented in Documentation/*. An online CPU must be kept outside this range to handle the timekeeping. Suggested-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- Documentation/kernel-parameters.txt | 6 +++ include/linux/tick.h | 7 ++++ kernel/time/Kconfig | 19 +++++++++ kernel/time/tick-sched.c | 62 +++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc37..231698feaddc 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1913,6 +1913,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Valid arguments: on, off Default: on + nohz_extended= [KNL,BOOT] + In kernels built with CONFIG_NO_HZ_EXTENDED=y, set + the specified list of CPUs whose tick will be stopped + whenever possible. You need to keep at least one online + CPU outside the range to maintain the timekeeping. + noiotrap [SH] Disables trapped I/O port accesses. noirqdebug [X86-32] Disables the code which attempts to detect and diff --git a/include/linux/tick.h b/include/linux/tick.h index 553272e6af55..44bfa8aa439f 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -157,6 +157,13 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ +#ifdef CONFIG_NO_HZ_EXTENDED +extern int tick_nohz_extended_cpu(int cpu); +#else +static inline int tick_nohz_extended_cpu(int cpu) { return 0; } +#endif + + # ifdef CONFIG_CPU_IDLE_GOV_MENU extern void menu_hrtimer_cancel(void); # else diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd7..5a87c03e45ad 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -79,6 +79,25 @@ config NO_HZ only trigger on an as-needed basis both when the system is busy and when the system is idle. +config NO_HZ_EXTENDED + bool "Full dynticks system" + depends on NO_HZ && RCU_USER_QS && VIRT_CPU_ACCOUNTING_GEN && RCU_NOCB_CPU && SMP + select CONTEXT_TRACKING_FORCE + help + Adaptively try to shutdown the tick whenever possible, even when + the CPU is running tasks. Typically this requires running a single + task on the CPU.
Chances for running tickless are maximized when + the task mostly runs in userspace and has few kernel activity. + + You need to fill up the nohz_extended boot parameter with the + desired range of dynticks CPUs. + + This is implemented at the expense of some overhead in user <-> kernel + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + + Say N. + config HIGH_RES_TIMERS bool "High Resolution Timer Support" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a19a39952c1b..79c275f08b7d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -142,6 +142,68 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) profile_tick(CPU_PROFILING); } +#ifdef CONFIG_NO_HZ_EXTENDED +static cpumask_var_t nohz_extended_mask; +bool have_nohz_extended_mask; + +int tick_nohz_extended_cpu(int cpu) +{ + if (!have_nohz_extended_mask) + return 0; + + return cpumask_test_cpu(cpu, nohz_extended_mask); +} + +/* Parse the boot-time nohz CPU list from the kernel parameters. */ +static int __init tick_nohz_extended_setup(char *str) +{ + alloc_bootmem_cpumask_var(&nohz_extended_mask); + if (cpulist_parse(str, nohz_extended_mask) < 0) + pr_warning("NOHZ: Incorrect nohz_extended cpumask\n"); + else + have_nohz_extended_mask = true; + return 1; +} +__setup("nohz_extended=", tick_nohz_extended_setup); + +static int __init init_tick_nohz_extended(void) +{ + cpumask_var_t online_nohz; + int cpu; + + if (!have_nohz_extended_mask) + return 0; + + if (!zalloc_cpumask_var(&online_nohz, GFP_KERNEL)) { + pr_warning("NO_HZ: Not enough memory to check extended nohz mask\n"); + return -ENOMEM; + } + + /* + * CPUs can probably not be concurrently offlined on initcall time. + * But we are paranoid, aren't we? + */ + get_online_cpus(); + + /* Ensure we keep a CPU outside the dynticks range for timekeeping */ + cpumask_and(online_nohz, cpu_online_mask, nohz_extended_mask); + if (cpumask_equal(online_nohz, cpu_online_mask)) { + cpu = cpumask_any(cpu_online_mask); + pr_warning("NO_HZ: Must keep at least one online CPU " + "out of nohz_extended range\n"); + pr_warning("NO_HZ: Clearing %d from nohz_extended range\n", cpu); + cpumask_clear_cpu(cpu, nohz_extended_mask); + } + put_online_cpus(); + free_cpumask_var(online_nohz); + + return 0; +} +core_initcall(init_tick_nohz_extended); +#else +#define have_nohz_extended_mask (0) +#endif + /* * NOHZ - aka dynamic tick functionality */ From a382bf934449ddeb625167537ae81daa0211b477 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 18 Dec 2012 18:24:35 +0100 Subject: [PATCH 02/39] nohz: Assign timekeeping duty to a CPU outside the full dynticks range This way the full nohz CPUs can safely run with the tick stopped with a guarantee that somebody else is taking care of the jiffies and GTOD progression. Once the duty is attributed to a CPU, it won't change. Also that CPU can't enter into dyntick idle mode or be hot unplugged. This may later be improved from a power consumption POV. At least we should be able to share the duty amongst all CPUs outside the full dynticks range. Then the duty could even be shared with full dynticks CPUs when those can't stop their tick for any reason. But let's start with that very simple approach first. 
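For illustration, here is a condensed sketch (not part of the patch; a 4-CPU machine booted with nohz_extended=1-3 is assumed) of how the duty assignment behaves: CPU 0 is the only CPU outside the range, so it takes the do_timer duty at tick setup time and keeps it from then on:

	/*
	 * Sketch only: condensed from the tick_setup_device() hunk below,
	 * assuming nohz_extended=1-3 on a 4-CPU machine.
	 */
	if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
		if (!tick_nohz_extended_cpu(cpu))	/* true only for CPU 0 */
			tick_do_timer_cpu = cpu;	/* CPU 0 takes the duty */
		else
			tick_do_timer_cpu = TICK_DO_TIMER_NONE;	/* CPUs 1-3 decline */
	}

The can_stop_idle_tick() hunk below then keeps that CPU from entering dyntick-idle mode as long as full dynticks CPUs are present.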
Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner [fix have_nohz_full_mask offcase] Signed-off-by: Steven Rostedt --- kernel/time/tick-broadcast.c | 3 ++- kernel/time/tick-common.c | 5 +++- kernel/time/tick-sched.c | 47 ++++++++++++++++++++++++++++++++++-- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8d..8a6875cc1879 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -573,7 +573,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; /* Take the do_timer update */ - tick_do_timer_cpu = cpu; + if (!tick_nohz_extended_cpu(cpu)) + tick_do_timer_cpu = cpu; /* * We must be careful here. There might be other CPUs diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6973f4..b7dc0cbdb59b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; + if (!tick_nohz_extended_cpu(cpu)) + tick_do_timer_cpu = cpu; + else + tick_do_timer_cpu = TICK_DO_TIMER_NONE; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 79c275f08b7d..57bb3fe5aaa3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -112,7 +112,8 @@ static void tick_sched_do_timer(ktime_t now) * this duty, then the jiffies update is still serialized by * jiffies_lock. */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) + && !tick_nohz_extended_cpu(cpu)) tick_do_timer_cpu = cpu; #endif @@ -166,6 +167,25 @@ static int __init tick_nohz_extended_setup(char *str) } __setup("nohz_extended=", tick_nohz_extended_setup); +static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* + * If we handle the timekeeping duty for full dynticks CPUs, + * we can't safely shutdown that CPU. + */ + if (have_nohz_extended_mask && tick_do_timer_cpu == cpu) + return -EINVAL; + break; + } + return NOTIFY_OK; +} + static int __init init_tick_nohz_extended(void) { cpumask_var_t online_nohz; @@ -174,6 +194,8 @@ static int __init init_tick_nohz_extended(void) if (!have_nohz_extended_mask) return 0; + cpu_notifier(tick_nohz_cpu_down_callback, 0); + if (!zalloc_cpumask_var(&online_nohz, GFP_KERNEL)) { pr_warning("NO_HZ: Not enough memory to check extended nohz mask\n"); return -ENOMEM; @@ -188,11 +210,17 @@ static int __init init_tick_nohz_extended(void) /* Ensure we keep a CPU outside the dynticks range for timekeeping */ cpumask_and(online_nohz, cpu_online_mask, nohz_extended_mask); if (cpumask_equal(online_nohz, cpu_online_mask)) { - cpu = cpumask_any(cpu_online_mask); pr_warning("NO_HZ: Must keep at least one online CPU " "out of nohz_extended range\n"); + /* + * We know the current CPU doesn't have its tick stopped. + * Let's use it for the timekeeping duty. 
+ */ + preempt_disable(); + cpu = smp_processor_id(); + pr_warning("NO_HZ: Clearing %d from nohz_extended range\n", cpu); + cpumask_clear_cpu(cpu, nohz_extended_mask); + preempt_enable(); } put_online_cpus(); free_cpumask_var(online_nohz); @@ -551,6 +579,21 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } + if (have_nohz_extended_mask) { + /* + * Keep the tick alive to guarantee timekeeping progression + * if there are full dynticks CPUs around + */ + if (tick_do_timer_cpu == cpu) + return false; + /* + * Boot safety: make sure the timekeeping duty has been + * assigned before entering dyntick-idle mode, + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + return false; + } + return true; } From 1c20091e77fc5a9b7d7d905176443b4822a23cdb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Aug 2011 23:21:01 +0200 Subject: [PATCH 03/39] nohz: Wake up full dynticks CPUs when a timer gets enqueued Wake up a CPU when a timer list timer is enqueued there and the target is part of the full dynticks range. Sending an IPI to it makes it reconsider the next timer to program on top of recent updates. This may later be improved by checking if the tick is really stopped on the target. This would need some careful synchronization though. So deal with such optimization later and start simple. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/sched.h | 4 ++-- kernel/sched/core.c | 20 +++++++++++++++++++- kernel/timer.c | 12 ++++++------ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9004f6e19eac..10626e2ee688 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1851,9 +1851,9 @@ static inline void idle_task_exit(void) {} #endif #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) -extern void wake_up_idle_cpu(int cpu); +extern void wake_up_nohz_cpu(int cpu); #else -static inline void wake_up_idle_cpu(int cpu) { } +static inline void wake_up_nohz_cpu(int cpu) { } #endif #ifdef CONFIG_SCHED_AUTOGROUP diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 849deb96e61e..e91ee589f793 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -587,7 +587,7 @@ unlock: * account when the CPU goes back to idle and evaluates the timer * wheel for the next timer event.
*/ -void wake_up_idle_cpu(int cpu) +static void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -617,6 +617,24 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static bool wake_up_extended_nohz_cpu(int cpu) +{ + if (tick_nohz_extended_cpu(cpu)) { + if (cpu != smp_processor_id() || + tick_nohz_tick_stopped()) + smp_send_reschedule(cpu); + return true; + } + + return false; +} + +void wake_up_nohz_cpu(int cpu) +{ + if (!wake_up_extended_nohz_cpu(cpu)) + wake_up_idle_cpu(cpu); +} + static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); diff --git a/kernel/timer.c b/kernel/timer.c index dbf7a78a1ef1..4e3040b40d16 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -930,14 +930,14 @@ void add_timer_on(struct timer_list *timer, int cpu) debug_activate(timer, timer->expires); internal_add_timer(base, timer); /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. + * We are protected against the other CPU fiddling * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. */ - wake_up_idle_cpu(cpu); + wake_up_nohz_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); From ab71d36ddb9e60d4ddb28a187718815d38c3c666 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Aug 2011 23:21:01 +0200 Subject: [PATCH 04/39] nohz: Unhide full dynticks feature from its dependencies The full dynticks feature only shows up when all its Kconfig dependencies are met (RCU nocbs, RCU user mode, ...) This is far from being user friendly as those who want to activate this feature need to look into the Kconfig files and iterate through each dependency then activate these by hand in order to show and select the full dynticks Kconfig option. So process the other way around: show up the Kconfig option if the minimal low level dependencies are met and activate the high level ones when we enable the feature. Note there is one exception in the picture: CONFIG_VIRT_CPU_ACCOUNTING_GEN is part of a Kconfig choice menu and it appears we can't select it from another Kconfig selection when it's under such layout. So for now this particular item stays as a passive dependency. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/Kconfig | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 5a87c03e45ad..726c33e00da2 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -80,11 +80,20 @@ config NO_HZ busy and when the system is idle. 
config NO_HZ_EXTENDED - bool "Full dynticks system" - depends on NO_HZ && RCU_USER_QS && VIRT_CPU_ACCOUNTING_GEN && RCU_NOCB_CPU && SMP - select CONTEXT_TRACKING_FORCE - help - Adaptively try to shutdown the tick whenever possible, even when + bool "Full dynticks system" + # NO_HZ dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # RCU_USER_QS + depends on HAVE_CONTEXT_TRACKING && SMP + # RCU_NOCB_CPU dependency + depends on TREE_RCU || TREE_PREEMPT_RCU + depends on VIRT_CPU_ACCOUNTING_GEN + select NO_HZ + select RCU_USER_QS + select RCU_NOCB_CPU + select CONTEXT_TRACKING_FORCE + help + Adaptively try to shutdown the tick whenever possible, even when the CPU is running tasks. Typically this requires running a single task on the CPU. Chances for running tickless are maximized when the task mostly runs in userspace and has few kernel activity. From 3451d0243c3cdfd729b36f9684a14659d4895ca3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Aug 2011 23:21:01 +0200 Subject: [PATCH 05/39] nohz: Rename CONFIG_NO_HZ to CONFIG_NO_HZ_COMMON We are planning to convert the dynticks Kconfig options layout into a choice menu. The user must be able to easily pick any of the following implementations: constant periodic tick, idle dynticks, full dynticks. As this implies a mutual exclusion, the two dynticks implementations need to converge on the selection of a common Kconfig option in order to ease the sharing of a common infrastructure. It would thus seem pretty natural to reuse CONFIG_NO_HZ to that end. It already implements all the idle dynticks code and the full dynticks depends on all that code for now. So ideally the choice menu would propose CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED then both would select CONFIG_NO_HZ. On the other hand we want to stay backward compatible: if CONFIG_NO_HZ is set in an older config file, we want to enable CONFIG_NO_HZ_IDLE by default. But we can't afford both at the same time or we run into a circular dependency: 1) CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED both select CONFIG_NO_HZ 2) If CONFIG_NO_HZ is set, we default to CONFIG_NO_HZ_IDLE We might be able to support that from Kconfig/Kbuild but it may not be wise to introduce such a confusing behaviour. So to solve this, create a new CONFIG_NO_HZ_COMMON option which gathers the common code between idle and full dynticks (that common code for now is simply the idle dynticks code) and select it from their referring Kconfig. Then we'll later create CONFIG_NO_HZ_IDLE and map CONFIG_NO_HZ to it for backward compatibility. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E.
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- Documentation/RCU/stallwarn.txt | 2 +- Documentation/cpu-freq/governors.txt | 4 ++-- arch/um/include/shared/common-offsets.h | 4 ++-- arch/um/os-Linux/time.c | 2 +- include/linux/sched.h | 8 ++++---- include/linux/tick.h | 8 ++++---- init/Kconfig | 2 +- kernel/hrtimer.c | 4 ++-- kernel/sched/core.c | 18 +++++++++--------- kernel/sched/fair.c | 10 +++++----- kernel/sched/sched.h | 4 ++-- kernel/softirq.c | 2 +- kernel/time/Kconfig | 13 +++++++++---- kernel/time/tick-sched.c | 12 ++++++------ kernel/timer.c | 4 ++-- 15 files changed, 51 insertions(+), 46 deletions(-) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 1927151b386b..b336755b71ed 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -176,7 +176,7 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that o A hardware or software issue shuts off the scheduler-clock interrupt on a CPU that is not in dyntick-idle mode. This problem really has happened, and seems to be most likely to - result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels. + result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. o A bug in the RCU implementation. diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt index c7a2eb8450c2..e3e5d9ae50cd 100644 --- a/Documentation/cpu-freq/governors.txt +++ b/Documentation/cpu-freq/governors.txt @@ -131,8 +131,8 @@ sampling_rate_min: The sampling rate is limited by the HW transition latency: transition_latency * 100 Or by kernel restrictions: -If CONFIG_NO_HZ is set, the limit is 10ms fixed. -If CONFIG_NO_HZ is not set or nohz=off boot parameter is used, the +If CONFIG_NO_HZ_COMMON is set, the limit is 10ms fixed. 
+If CONFIG_NO_HZ_COMMON is not set or nohz=off boot parameter is used, the limits depend on the CONFIG_HZ option: HZ=1000: min=20000us (20ms) HZ=250: min=80000us (80ms) diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 2df313b6a586..c92306809029 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -30,8 +30,8 @@ DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); #ifdef CONFIG_PRINTK DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK); #endif -#ifdef CONFIG_NO_HZ -DEFINE(UML_CONFIG_NO_HZ, CONFIG_NO_HZ); +#ifdef CONFIG_NO_HZ_COMMON +DEFINE(UML_CONFIG_NO_HZ_COMMON, CONFIG_NO_HZ_COMMON); #endif #ifdef CONFIG_UML_X86 DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index fac388cb464f..e9824d5dd7d5 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -79,7 +79,7 @@ long long os_nsecs(void) return timeval_to_ns(&tv); } -#ifdef UML_CONFIG_NO_HZ +#ifdef UML_CONFIG_NO_HZ_COMMON static int after_sleep_interval(struct timespec *ts) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index 10626e2ee688..1ff9e0a5de27 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -230,7 +230,7 @@ extern void init_idle_bootup_task(struct task_struct *idle); extern int runqueue_is_locked(int cpu); -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); extern int get_nohz_timer_target(void); @@ -1758,13 +1758,13 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, } #endif -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON void calc_load_enter_idle(void); void calc_load_exit_idle(void); #else static inline void calc_load_enter_idle(void) { } static inline void calc_load_exit_idle(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ #ifndef CONFIG_CPUMASK_OFFSTACK static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) @@ -1850,7 +1850,7 @@ extern void idle_task_exit(void); static inline void idle_task_exit(void) {} #endif -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) extern void wake_up_nohz_cpu(int cpu); #else static inline void wake_up_nohz_cpu(int cpu) { } diff --git a/include/linux/tick.h b/include/linux/tick.h index 44bfa8aa439f..5e403339ee14 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -82,7 +82,7 @@ extern int tick_program_event(ktime_t expires, int force); extern void tick_setup_sched_timer(void); # endif -# if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +# if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS extern void tick_cancel_sched_timer(int cpu); # else static inline void tick_cancel_sched_timer(int cpu) { } @@ -123,7 +123,7 @@ static inline void tick_check_idle(int cpu) { } static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ -# ifdef CONFIG_NO_HZ +# ifdef CONFIG_NO_HZ_COMMON DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched); static inline int tick_nohz_tick_stopped(void) @@ -138,7 +138,7 @@ extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); -# else /* !CONFIG_NO_HZ */ +# else /* !CONFIG_NO_HZ_COMMON */ static inline int tick_nohz_tick_stopped(void) { 
return 0; @@ -155,7 +155,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void) } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } -# endif /* !NO_HZ */ +# endif /* !CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_EXTENDED extern int tick_nohz_extended_cpu(int cpu); diff --git a/init/Kconfig b/init/Kconfig index 8a1dac2f80a9..edc8132584f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -580,7 +580,7 @@ config RCU_FANOUT_EXACT config RCU_FAST_NO_HZ bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ && SMP + depends on NO_HZ_COMMON && SMP default n help This option causes RCU to attempt to accelerate grace periods in diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3feb..ec60482d8b03 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -160,7 +160,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, */ static int hrtimer_get_target(int this_cpu, int pinned) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif @@ -1106,7 +1106,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e91ee589f793..9bb397da63d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -549,7 +549,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers * from an idle cpu. This is good for power-savings. @@ -641,14 +641,14 @@ static inline bool got_nohz_idle_kick(void) return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); } -#else /* CONFIG_NO_HZ */ +#else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) { return false; } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ void sched_avg_update(struct rq *rq) { @@ -2139,7 +2139,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return load >> FSHIFT; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. 
* @@ -2365,12 +2365,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ static inline long calc_load_fold_idle(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2530,7 +2530,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -2590,7 +2590,7 @@ void update_cpu_load_nohz(void) } raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from scheduler_tick() @@ -7023,7 +7023,7 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON rq->nohz_flags = 0; #endif #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 539760ef00c4..5c97fca091a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5331,7 +5331,7 @@ out_unlock: return 0; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details * - When one of the busy CPUs notice that there may be an idle rebalancing @@ -5541,9 +5541,9 @@ out: rq->next_balance = next_balance; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* - * In CONFIG_NO_HZ case, the idle balance kickee will do the + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) @@ -5686,7 +5686,7 @@ void trigger_load_balance(struct rq *rq, int cpu) if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif @@ -6156,7 +6156,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3bd15a43eebc..889904dd6d77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -404,7 +404,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; #endif @@ -1333,7 +1333,7 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); extern void account_cfs_bandwidth_used(int enabled, int was_enabled); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, diff --git a/kernel/softirq.c b/kernel/softirq.c index b4d252fd195b..de15813f2a66 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -348,7 +348,7 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 
tick_nohz_irq_exit(); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 726c33e00da2..c88fc43494c9 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -64,16 +64,21 @@ config GENERIC_CMOS_UPDATE if GENERIC_CLOCKEVENTS menu "Timers subsystem" -# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independ of this. config TICK_ONESHOT bool +config NO_HZ_COMMON + bool + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + config NO_HZ bool "Tickless System (Dynamic Ticks)" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - select TICK_ONESHOT + select NO_HZ_COMMON help This option enables a tickless system: timer interrupts will only trigger on an as-needed basis both when the system is @@ -81,14 +86,14 @@ config NO_HZ config NO_HZ_EXTENDED bool "Full dynticks system" - # NO_HZ dependency + # NO_HZ_COMMON dependency depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # RCU_USER_QS depends on HAVE_CONTEXT_TRACKING && SMP # RCU_NOCB_CPU dependency depends on TREE_RCU || TREE_PREEMPT_RCU depends on VIRT_CPU_ACCOUNTING_GEN - select NO_HZ + select NO_HZ_COMMON select RCU_USER_QS select RCU_NOCB_CPU select CONTEXT_TRACKING_FORCE diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 57bb3fe5aaa3..ccfc2086cd4b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -104,7 +104,7 @@ static void tick_sched_do_timer(ktime_t now) { int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the cpu in charge went @@ -124,7 +124,7 @@ static void tick_sched_do_timer(ktime_t now) static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long @@ -235,7 +235,7 @@ core_initcall(init_tick_nohz_extended); /* * NOHZ - aka dynamic tick functionality */ -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? 
*/ @@ -907,7 +907,7 @@ static inline void tick_check_nohz(int cpu) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_check_nohz(int cpu) { } -#endif /* NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() @@ -992,14 +992,14 @@ void tick_setup_sched_timer(void) now = ktime_get(); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (tick_nohz_enabled) ts->nohz_mode = NOHZ_MODE_HIGHRES; #endif } #endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); diff --git a/kernel/timer.c b/kernel/timer.c index 4e3040b40d16..1b7489fdea41 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -738,7 +738,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) cpu = get_nohz_timer_target(); #endif @@ -1188,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a CPU is idle. From 3ca277e41914ab344214ed50a41c14c48ae973f3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Aug 2011 23:21:01 +0200 Subject: [PATCH 06/39] nohz: Pack nohz Kconfig option in a menu of choices Now the user has the choice between three implementations of the timer tick: * Static periodic tick * Idle dynticks * Full dynticks At least for now, these are mutually exclusive choices, so let's rely on the proper Kconfig feature to display these to the user. A new entry CONFIG_NO_HZ_IDLE is created and the old CONFIG_NO_HZ maps to it for config file backward compatibility. The old name was too general now that we have more granular dynticks implementations. While at it, add some explanation to help the user on his decision between the 3 entries. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/Kconfig | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index c88fc43494c9..27cc404ea187 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -75,17 +75,33 @@ config NO_HZ_COMMON depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT +# Kept around for compatibility, maps to NO_HZ_IDLE config NO_HZ - bool "Tickless System (Dynamic Ticks)" + bool + +choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ + +config PERIODIC_HZ + bool "Periodic timer ticks (constant rate, no dynticks)" + help + This option keeps the tick running periodically at a constant + rate, even when the CPU doesn't need it. 
+ +config NO_HZ_IDLE + bool "Idle dynticks system (tickless idle)" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select NO_HZ_COMMON help - This option enables a tickless system: timer interrupts will - only trigger on an as-needed basis both when the system is - busy and when the system is idle. + This option enables a tickless idle system: timer interrupts + will only trigger on an as-needed basis when the system is idle. + This is usually interesting for energy saving. + + Most of the time you want to say Y here. config NO_HZ_EXTENDED - bool "Full dynticks system" + bool "Full dynticks system (tickless single task)" # NO_HZ_COMMON dependency depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # RCU_USER_QS @@ -112,6 +128,8 @@ config NO_HZ_EXTENDED Say N. +endchoice + config HIGH_RES_TIMERS bool "High Resolution Timer Support" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS From 1034fc2f41aaf32f782a9362178f9a236ac5a50a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Mar 2013 15:04:50 +0100 Subject: [PATCH 07/39] nohz: Print final full dynticks CPUs range on boot Given that we apply a few restrictions on the full dynticks CPUs range (keep an online timekeeper outside the range, and in the future have the range be a subset of the RCU nocb CPUs), let's print the final resulting range of full dynticks CPUs so that the user knows what is really going to run. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ccfc2086cd4b..e057d338daa4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -186,6 +186,13 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, return NOTIFY_OK; } +/* + * Worst case string length in chunks of CPU range seems 2 steps + * separations: 0,2,4,6,... + * This is NR_CPUS + sizeof('\0') + */ +static char __initdata nohz_ext_buf[NR_CPUS + 1]; + static int __init init_tick_nohz_extended(void) { cpumask_var_t online_nohz; int cpu; @@ -225,6 +232,9 @@ static int __init init_tick_nohz_extended(void) put_online_cpus(); free_cpumask_var(online_nohz); + cpulist_scnprintf(nohz_ext_buf, sizeof(nohz_ext_buf), nohz_extended_mask); + pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_ext_buf); + return 0; } core_initcall(init_tick_nohz_extended); From 0644ca5c774f1a7033bfc83c7ed0660f74f28b56 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 16:40:12 +0200 Subject: [PATCH 08/39] nohz: Fix old dynticks idle Kconfig backward compatibility In order to enforce backward compatibility with older config files, we want the new dynticks-idle Kconfig entry to default to the value of the old CONFIG_NO_HZ symbol, if present. Namely we want: config NO_HZ # old obsolete dynticks idle symbol bool config NO_HZ_IDLE # new dynticks idle symbol default NO_HZ However, Kconfig prevents this from working if the old symbol is not visible. And this is currently the case because NO_HZ lacks a title, so it does not show up in make oldconfig and the like. To fix this, give the obsolete Kconfig entry a minimal title and a help text that explains its purpose. This makes the "defaulting" work.
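A minimal sketch of the resulting layout (abridged from the diff below): since make oldconfig only honors .config values for visible symbols, giving NO_HZ a prompt is what lets a stored CONFIG_NO_HZ=y reach the choice default:

	config NO_HZ
		bool "Old Idle dynticks config"	# visible: oldconfig now reads CONFIG_NO_HZ=y

	choice
		prompt "Timer tick handling"
		default NO_HZ_IDLE if NO_HZ	# ...so this default can finally fire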
Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/Kconfig | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 27cc404ea187..cbe64be17d1f 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -75,10 +75,6 @@ config NO_HZ_COMMON depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT -# Kept around for compatibility, maps to NO_HZ_IDLE -config NO_HZ - bool - choice prompt "Timer tick handling" default NO_HZ_IDLE if NO_HZ @@ -130,6 +126,14 @@ config NO_HZ_EXTENDED endchoice +config NO_HZ + bool "Old Idle dynticks config" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + help + This is the old config entry that enables dynticks idle. + We keep it around for a little while to enforce backward + compatibility with older config files. + config HIGH_RES_TIMERS bool "High Resolution Timer Support" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS From c5bfece2d6129131b4ade985e63bc35ddf5868d4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 16:45:34 +0200 Subject: [PATCH 09/39] nohz: Switch from "extended nohz" to "full nohz" based naming "Extended nohz" was used as a naming base for the full dynticks API and Kconfig symbols. It reflects the fact the system tries to stop the tick in more places than just idle. But that "extended" name is a bit opaque and vague. Rename it to "full" makes it clearer what the system tries to do under this config: try to shutdown the tick anytime it can. The various constraints that prevent that to happen shouldn't be considered as fundamental properties of this feature but rather technical issues that may be solved in the future. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- Documentation/kernel-parameters.txt | 4 +-- include/linux/tick.h | 6 ++-- kernel/sched/core.c | 6 ++-- kernel/time/Kconfig | 4 +-- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 2 +- kernel/time/tick-sched.c | 54 ++++++++++++++--------------- 7 files changed, 39 insertions(+), 39 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 231698feaddc..82365dde00a8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1913,8 +1913,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Valid arguments: on, off Default: on - nohz_extended= [KNL,BOOT] - In kernels built with CONFIG_NO_HZ_EXTENDED=y, set + nohz_full= [KNL,BOOT] + In kernels built with CONFIG_NO_HZ_FULL=y, set the specified list of CPUs whose tick will be stopped whenever possible. You need to keep at least one online CPU outside the range to maintain the timekeeping. 
diff --git a/include/linux/tick.h b/include/linux/tick.h index 5e403339ee14..b4e3b0c9639e 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -157,10 +157,10 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_NO_HZ_EXTENDED -extern int tick_nohz_extended_cpu(int cpu); +#ifdef CONFIG_NO_HZ_FULL +extern int tick_nohz_full_cpu(int cpu); #else -static inline int tick_nohz_extended_cpu(int cpu) { return 0; } +static inline int tick_nohz_full_cpu(int cpu) { return 0; } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9bb397da63d6..0f0a5b3fd62c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -617,9 +617,9 @@ static void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } -static bool wake_up_extended_nohz_cpu(int cpu) +static bool wake_up_full_nohz_cpu(int cpu) { - if (tick_nohz_extended_cpu(cpu)) { + if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) smp_send_reschedule(cpu); @@ -631,7 +631,7 @@ static bool wake_up_extended_nohz_cpu(int cpu) void wake_up_nohz_cpu(int cpu) { - if (!wake_up_extended_nohz_cpu(cpu)) + if (!wake_up_full_nohz_cpu(cpu)) wake_up_idle_cpu(cpu); } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index cbe64be17d1f..4a17b5069466 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -96,7 +96,7 @@ config NO_HZ_IDLE Most of the time you want to say Y here. -config NO_HZ_EXTENDED +config NO_HZ_FULL bool "Full dynticks system (tickless single task)" # NO_HZ_COMMON dependency depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS @@ -115,7 +115,7 @@ config NO_HZ_EXTENDED task on the CPU. Chances for running tickless are maximized when the task mostly runs in userspace and has few kernel activity. - You need to fill up the nohz_extended boot parameter with the + You need to fill up the nohz_full boot parameter with the desired range of dynticks CPUs. This is implemented at the expense of some overhead in user <-> kernel diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8a6875cc1879..a3a3123f6272 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -573,7 +573,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; /* Take the do_timer update */ - if (!tick_nohz_extended_cpu(cpu)) + if (!tick_nohz_full_cpu(cpu)) tick_do_timer_cpu = cpu; /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b7dc0cbdb59b..83f2bd967161 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -163,7 +163,7 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - if (!tick_nohz_extended_cpu(cpu)) + if (!tick_nohz_full_cpu(cpu)) tick_do_timer_cpu = cpu; else tick_do_timer_cpu = TICK_DO_TIMER_NONE; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e057d338daa4..369b5769fc97 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -113,7 +113,7 @@ static void tick_sched_do_timer(ktime_t now) * jiffies_lock. 
*/ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) - && !tick_nohz_extended_cpu(cpu)) + && !tick_nohz_full_cpu(cpu)) tick_do_timer_cpu = cpu; #endif @@ -143,29 +143,29 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) profile_tick(CPU_PROFILING); } -#ifdef CONFIG_NO_HZ_EXTENDED -static cpumask_var_t nohz_extended_mask; -bool have_nohz_extended_mask; +#ifdef CONFIG_NO_HZ_FULL +static cpumask_var_t nohz_full_mask; +bool have_nohz_full_mask; -int tick_nohz_extended_cpu(int cpu) +int tick_nohz_full_cpu(int cpu) { - if (!have_nohz_extended_mask) + if (!have_nohz_full_mask) return 0; - return cpumask_test_cpu(cpu, nohz_extended_mask); + return cpumask_test_cpu(cpu, nohz_full_mask); } /* Parse the boot-time nohz CPU list from the kernel parameters. */ -static int __init tick_nohz_extended_setup(char *str) +static int __init tick_nohz_full_setup(char *str) { - alloc_bootmem_cpumask_var(&nohz_extended_mask); - if (cpulist_parse(str, nohz_extended_mask) < 0) - pr_warning("NOHZ: Incorrect nohz_extended cpumask\n"); + alloc_bootmem_cpumask_var(&nohz_full_mask); + if (cpulist_parse(str, nohz_full_mask) < 0) + pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); else - have_nohz_extended_mask = true; + have_nohz_full_mask = true; return 1; } -__setup("nohz_extended=", tick_nohz_extended_setup); +__setup("nohz_full=", tick_nohz_full_setup); static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, unsigned long action, @@ -179,7 +179,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, * If we handle the timekeeping duty for full dynticks CPUs, * we can't safely shutdown that CPU. */ - if (have_nohz_extended_mask && tick_do_timer_cpu == cpu) + if (have_nohz_full_mask && tick_do_timer_cpu == cpu) return -EINVAL; break; } @@ -191,20 +191,20 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, * separations: 0,2,4,6,... * This is NR_CPUS + sizeof('\0') */ -static char __initdata nohz_ext_buf[NR_CPUS + 1]; +static char __initdata nohz_full_buf[NR_CPUS + 1]; -static int __init init_tick_nohz_extended(void) +static int __init init_tick_nohz_full(void) { cpumask_var_t online_nohz; int cpu; - if (!have_nohz_extended_mask) + if (!have_nohz_full_mask) return 0; cpu_notifier(tick_nohz_cpu_down_callback, 0); if (!zalloc_cpumask_var(&online_nohz, GFP_KERNEL)) { - pr_warning("NO_HZ: Not enough memory to check extended nohz mask\n"); + pr_warning("NO_HZ: Not enough memory to check full nohz mask\n"); return -ENOMEM; } @@ -215,31 +215,31 @@ static int __init init_tick_nohz_extended(void) get_online_cpus(); /* Ensure we keep a CPU outside the dynticks range for timekeeping */ - cpumask_and(online_nohz, cpu_online_mask, nohz_extended_mask); + cpumask_and(online_nohz, cpu_online_mask, nohz_full_mask); if (cpumask_equal(online_nohz, cpu_online_mask)) { pr_warning("NO_HZ: Must keep at least one online CPU " - "out of nohz_extended range\n"); + "out of nohz_full range\n"); /* * We know the current CPU doesn't have its tick stopped. * Let's use it for the timekeeping duty. 
*/ preempt_disable(); cpu = smp_processor_id(); - pr_warning("NO_HZ: Clearing %d from nohz_extended range\n", cpu); - cpumask_clear_cpu(cpu, nohz_extended_mask); + pr_warning("NO_HZ: Clearing %d from nohz_full range\n", cpu); + cpumask_clear_cpu(cpu, nohz_full_mask); preempt_enable(); } put_online_cpus(); free_cpumask_var(online_nohz); - cpulist_scnprintf(nohz_ext_buf, sizeof(nohz_ext_buf), nohz_extended_mask); - pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_ext_buf); + cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); + pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); return 0; } -core_initcall(init_tick_nohz_extended); +core_initcall(init_tick_nohz_full); #else -#define have_nohz_extended_mask (0) +#define have_nohz_full_mask (0) #endif /* @@ -589,7 +589,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (have_nohz_extended_mask) { + if (have_nohz_full_mask) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around From 5b533f4ff5b6d8f2442f2ac82ca1530ff3f55267 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Apr 2013 16:53:35 +0200 Subject: [PATCH 10/39] nohz: Align periodic tick Kconfig with other choices' naming convention Rename CONFIG_PERIODIC_HZ to CONFIG_HZ_PERIODIC in order to stay consistent with other tick implementation entries: CONFIG_HZ_PERIODIC CONFIG_NO_HZ_IDLE CONFIG_NO_HZ_FULL Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 4a17b5069466..3b683227a10d 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -79,7 +79,7 @@ choice prompt "Timer tick handling" default NO_HZ_IDLE if NO_HZ -config PERIODIC_HZ +config HZ_PERIODIC bool "Periodic timer ticks (constant rate, no dynticks)" help This option keeps the tick running periodically at a constant From fae30dd6698ecaa93cb50213266c224bc550c32c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Apr 2013 17:04:04 +0200 Subject: [PATCH 11/39] nohz: Improve a bit the full dynticks Kconfig documentation Remove the "single task" statement from CONFIG_NO_HZ_FULL title. The constraint can be invalidated when tasks from other sched classes than SCHED_FAIR are running. Moreover it's possible that hrtick join the party in the future. Also add a line about the dependency on SMP. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/Kconfig | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 3b683227a10d..358d601a4fec 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -97,11 +97,13 @@ config NO_HZ_IDLE Most of the time you want to say Y here. 
config NO_HZ_FULL - bool "Full dynticks system (tickless single task)" + bool "Full dynticks system (tickless)" # NO_HZ_COMMON dependency depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - # RCU_USER_QS - depends on HAVE_CONTEXT_TRACKING && SMP + # We need at least one periodic CPU for timekeeping + depends on SMP + # RCU_USER_QS dependency + depends on HAVE_CONTEXT_TRACKING # RCU_NOCB_CPU dependency depends on TREE_RCU || TREE_PREEMPT_RCU depends on VIRT_CPU_ACCOUNTING_GEN From 65d798f0f9339ae2c4ebe9480e3260b33382a584 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Apr 2013 16:19:10 -0700 Subject: [PATCH 12/39] rcu: Kick adaptive-ticks CPUs that are holding up RCU grace periods Adaptive-ticks CPUs inform RCU when they enter kernel mode, but they do not necessarily turn the scheduler-clock tick back on. This state of affairs could result in RCU waiting on an adaptive-ticks CPU running for an extended period in kernel mode. Such a CPU will never run the RCU state machine, and could therefore indefinitely extend the RCU state machine, sooner or later resulting in an OOM condition. This patch, inspired by an earlier patch by Frederic Weisbecker, therefore causes RCU's force-quiescent-state processing to check for this condition and to send an IPI to CPUs that remain in that state for too long. "Too long" currently means about three jiffies by default, which is quite some time for a CPU to remain in the kernel without blocking. The rcu_tree.jiffies_till_first_fqs and rcutree.jiffies_till_next_fqs sysfs variables may be used to tune "too long" if needed. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/rcutree.c | 10 ++++++++++ kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 18 ++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5b8ad827fd86..f5ab50235cba 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -794,6 +794,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rdp->offline_fqs++; return 1; } + + /* + * There is a possibility that a CPU in adaptive-ticks state + * might run in the kernel with the scheduling-clock tick disabled + * for an extended time period. Invoke rcu_kick_nohz_cpu() to + * force the CPU to restart the scheduling-clock tick in this + * CPU is in this state. 
+ */ + rcu_kick_nohz_cpu(rdp->cpu); + return 0; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c896b5045d9d..f993c0ac47db 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -539,6 +539,7 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void init_nocb_callback_list(struct rcu_data *rdp); static void __init rcu_init_nocb(void); +static void rcu_kick_nohz_cpu(int cpu); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e17ff9d..a5745e9b5d5a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,6 +28,7 @@ #include #include #include +#include #define RCU_KTHREAD_PRIO 1 @@ -2502,3 +2503,20 @@ static void __init rcu_init_nocb(void) } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + +/* + * An adaptive-ticks CPU can potentially execute in kernel mode for an + * arbitrarily long period of time with the scheduling-clock tick turned + * off. RCU will be paying attention to this CPU because it is in the + * kernel, but the CPU cannot be guaranteed to be executing the RCU state + * machine because the scheduling-clock tick has been disabled. Therefore, + * if an adaptive-ticks CPU is failing to respond to the current grace + * period and has not be idle from an RCU perspective, kick it. + */ +static void rcu_kick_nohz_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_cpu(cpu)) + smp_send_reschedule(cpu); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ +} From 76c24fb054b52b34af4dcde589cbb9e2b98fc74c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 18 Apr 2013 00:15:40 +0200 Subject: [PATCH 13/39] nohz: New APIs to re-evaluate the tick on full dynticks CPUs Provide two new helpers in order to notify the full dynticks CPUs about some internal system changes against which they may reconsider the state of their tick. Some practical examples include: posix cpu timers, perf tick and sched clock tick. For now the notifying handler, implemented through IPIs, is a stub that will be implemented when we get the tick stop/restart infrastructure in. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/tick.h | 4 ++++ kernel/time/Kconfig | 1 + kernel/time/tick-sched.c | 51 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/include/linux/tick.h b/include/linux/tick.h index b4e3b0c9639e..c2dcfb18f65b 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -159,8 +159,12 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #ifdef CONFIG_NO_HZ_FULL extern int tick_nohz_full_cpu(int cpu); +extern void tick_nohz_full_kick(void); +extern void tick_nohz_full_kick_all(void); #else static inline int tick_nohz_full_cpu(int cpu) { return 0; } +static inline void tick_nohz_full_kick(void) { } +static inline void tick_nohz_full_kick_all(void) { } #endif diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 358d601a4fec..fbb4c7eb92a0 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -111,6 +111,7 @@ config NO_HZ_FULL select RCU_USER_QS select RCU_NOCB_CPU select CONTEXT_TRACKING_FORCE + select IRQ_WORK help Adaptively try to shutdown the tick whenever possible, even when the CPU is running tasks. Typically this requires running a single diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 369b5769fc97..2bcad5b904d8 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -147,6 +147,57 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) static cpumask_var_t nohz_full_mask; bool have_nohz_full_mask; +/* + * Re-evaluate the need for the tick on the current CPU + * and restart it if necessary. + */ +static void tick_nohz_full_check(void) +{ + /* + * STUB for now, will be filled with the full tick stop/restart + * infrastructure patches + */ +} + +static void nohz_full_kick_work_func(struct irq_work *work) +{ + tick_nohz_full_check(); +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { + .func = nohz_full_kick_work_func, +}; + +/* + * Kick the current CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick(void) +{ + if (tick_nohz_full_cpu(smp_processor_id())) + irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + +static void nohz_full_kick_ipi(void *info) +{ + tick_nohz_full_check(); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_all(void) +{ + if (!have_nohz_full_mask) + return; + + preempt_disable(); + smp_call_function_many(nohz_full_mask, + nohz_full_kick_ipi, NULL, false); + preempt_enable(); +} + int tick_nohz_full_cpu(int cpu) { if (!have_nohz_full_mask) From 0453b435df0d69dd0d8c42eb9b3015aaf0d8a032 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 27 Mar 2013 02:18:34 +0100 Subject: [PATCH 14/39] nohz: Force boot CPU outside full dynticks range The timekeeping job must be able to run early on boot because there may be some pre-SMP (and thus pre-initcalls) components that rely on it. The IO-APIC is one such user as it tests the timer health by watching jiffies progression. Given that this happens before we know the initial online set, we can't rely on it to select a timekeeper. We need one before SMP time, otherwise we simply crash on boot.
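To illustrate, the kind of early jiffies-progression check that breaks without a ticking timekeeper looks roughly like this; a condensed sketch modeled on the x86 IO-APIC's timer_irq_works(), not the literal kernel code:

        /*
         * Sketch of an early-boot timer health check, modeled on the x86
         * IO-APIC's timer_irq_works(). If no online CPU keeps ticking for
         * timekeeping duty, jiffies never advances and the check fails.
         */
        static int __init timer_irq_works_sketch(void)
        {
                unsigned long t1 = jiffies;
                unsigned long flags;

                local_save_flags(flags);
                local_irq_enable();
                mdelay((10 * 1000) / HZ);       /* let roughly ten ticks elapse */
                local_irq_restore(flags);

                /* The timer is healthy if jiffies moved forward meanwhile. */
                return time_after(jiffies, t1 + 4);
        }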
To fix this and keep things simple for now, force the boot CPU outside of the full dynticks range in any case and do this early, at kernel parameter parsing time. We might want a trickier solution later, especially for aSMP architectures that need to assign housekeeping tasks to arbitrary low power CPUs. But it's still first pass KISS time for now. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- Documentation/kernel-parameters.txt | 4 +-- kernel/time/tick-sched.c | 54 ++++++++--------------------- 2 files changed, 17 insertions(+), 41 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 82365dde00a8..887b29708447 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1916,8 +1916,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nohz_full= [KNL,BOOT] In kernels built with CONFIG_NO_HZ_FULL=y, set the specified list of CPUs whose tick will be stopped - whenever possible. You need to keep at least one online - CPU outside the range to maintain the timekeeping. + whenever possible. The boot CPU will be forced outside + the range to maintain the timekeeping. noiotrap [SH] Disables trapped I/O port accesses. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 369b5769fc97..2bac5ea2c9af 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -158,11 +158,21 @@ int tick_nohz_full_cpu(int cpu) /* Parse the boot-time nohz CPU list from the kernel parameters. */ static int __init tick_nohz_full_setup(char *str) { + int cpu; + alloc_bootmem_cpumask_var(&nohz_full_mask); - if (cpulist_parse(str, nohz_full_mask) < 0) + if (cpulist_parse(str, nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); - else - have_nohz_full_mask = true; + return 1; + } + + cpu = smp_processor_id(); + if (cpumask_test_cpu(cpu, nohz_full_mask)) { + pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, nohz_full_mask); + } + have_nohz_full_mask = true; + return 1; } __setup("nohz_full=", tick_nohz_full_setup); @@ -195,42 +205,8 @@ static char __initdata nohz_full_buf[NR_CPUS + 1]; static int __init init_tick_nohz_full(void) { - cpumask_var_t online_nohz; - int cpu; - - if (!have_nohz_full_mask) - return 0; - - cpu_notifier(tick_nohz_cpu_down_callback, 0); - - if (!zalloc_cpumask_var(&online_nohz, GFP_KERNEL)) { - pr_warning("NO_HZ: Not enough memory to check full nohz mask\n"); - return -ENOMEM; - } - - /* - * CPUs can probably not be concurrently offlined on initcall time. - * But we are paranoid, aren't we? - */ - get_online_cpus(); - - /* Ensure we keep a CPU outside the dynticks range for timekeeping */ - cpumask_and(online_nohz, cpu_online_mask, nohz_full_mask); - if (cpumask_equal(online_nohz, cpu_online_mask)) { - pr_warning("NO_HZ: Must keep at least one online CPU " - "out of nohz_full range\n"); - /* - * We know the current CPU doesn't have its tick stopped. - * Let's use it for the timekeeping duty.
- */ - preempt_disable(); - cpu = smp_processor_id(); - pr_warning("NO_HZ: Clearing %d from nohz_full range\n", cpu); - cpumask_clear_cpu(cpu, nohz_full_mask); - preempt_enable(); - } - put_online_cpus(); - free_cpumask_var(online_nohz); + if (have_nohz_full_mask) + cpu_notifier(tick_nohz_cpu_down_callback, 0); cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); From d1e43fa5f8bb25f83a86a29f11fcfb57ed4d7566 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Mar 2013 23:47:24 +0100 Subject: [PATCH 15/39] nohz: Ensure full dynticks CPUs are RCU nocbs We need full dynticks CPUs to also be RCU nocbs so that we don't have to keep the tick to handle RCU callbacks. Make sure the range passed to the nohz_full= boot parameter is a subset of rcu_nocbs=. The CPUs that fail to meet this requirement will be excluded from the nohz_full range. This is checked early at boot time, before any CPU has the opportunity to stop its tick. Suggested-by: Steven Rostedt Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- Documentation/kernel-parameters.txt | 2 ++ include/linux/rcupdate.h | 7 +++++++ include/linux/tick.h | 2 ++ init/main.c | 1 + kernel/rcutree.c | 6 +++--- kernel/rcutree.h | 1 - kernel/rcutree_plugin.h | 13 ++++--------- kernel/time/tick-sched.c | 22 ++++++++++++++------ 8 files changed, 35 insertions(+), 19 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 887b29708447..4865e9bfd08d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1918,6 +1918,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. the specified list of CPUs whose tick will be stopped whenever possible. The boot CPU will be forced outside the range to maintain the timekeeping. + The CPUs in this range must also be included in the + rcu_nocbs= set. noiotrap [SH] Disables trapped I/O port accesses.
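Conceptually, the requirement is a cpumask containment test. A hedged sketch of the check using the generic cpumask helpers follows; the actual change below instead probes each CPU with rcu_is_nocb_cpu() and trims offenders rather than failing (the function name here is hypothetical):

        /*
         * Sketch only: nohz_full_mask must be contained in rcu_nocb_mask,
         * otherwise a full dynticks CPU would need the tick to process
         * its RCU callbacks.
         */
        static int __init check_nohz_full_is_nocb(void)
        {
                if (!cpumask_subset(nohz_full_mask, rcu_nocb_mask)) {
                        pr_warning("NO_HZ: nohz_full= is not a subset of rcu_nocbs=\n");
                        return -EINVAL;
                }
                return 0;
        }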
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b758ce17b309..8e0948c872fc 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -999,4 +999,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) +#ifdef CONFIG_RCU_NOCB_CPU +extern bool rcu_is_nocb_cpu(int cpu); +#else +static inline bool rcu_is_nocb_cpu(int cpu) { return false; } +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/tick.h b/include/linux/tick.h index b4e3b0c9639e..0b6873cbf512 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -158,8 +158,10 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); extern int tick_nohz_full_cpu(int cpu); #else +static inline void tick_nohz_init(void) { } static inline int tick_nohz_full_cpu(int cpu) { return 0; } #endif diff --git a/init/main.c b/init/main.c index 63534a141b4e..2acb5bbde99b 100644 --- a/init/main.c +++ b/init/main.c @@ -547,6 +547,7 @@ asmlinkage void __init start_kernel(void) idr_init_cache(); perf_event_init(); rcu_init(); + tick_nohz_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ early_irq_init(); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f5ab50235cba..1d4ceff793a4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1695,7 +1695,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* No-CBs CPUs do not have orphanable callbacks. */ - if (is_nocb_cpu(rdp->cpu)) + if (rcu_is_nocb_cpu(rdp->cpu)) return; /* @@ -2757,10 +2757,10 @@ static void _rcu_barrier(struct rcu_state *rsp) * corresponding CPU's preceding callbacks have been invoked. */ for_each_possible_cpu(cpu) { - if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) + if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) continue; rdp = per_cpu_ptr(rsp->rda, cpu); - if (is_nocb_cpu(cpu)) { + if (rcu_is_nocb_cpu(cpu)) { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index f993c0ac47db..38acc49da2c6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -529,7 +529,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); -static bool is_nocb_cpu(int cpu); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy); static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a5745e9b5d5a..0cd91cc18db4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2167,7 +2167,7 @@ static int __init parse_rcu_nocb_poll(char *arg) early_param("rcu_nocb_poll", parse_rcu_nocb_poll); /* Is the specified CPU a no-CPUs CPU? 
*/ -static bool is_nocb_cpu(int cpu) +bool rcu_is_nocb_cpu(int cpu) { if (have_rcu_nocb_mask) return cpumask_test_cpu(cpu, rcu_nocb_mask); @@ -2225,7 +2225,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy) { - if (!is_nocb_cpu(rdp->cpu)) + if (!rcu_is_nocb_cpu(rdp->cpu)) return 0; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); return 1; @@ -2242,7 +2242,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, long qll = rsp->qlen_lazy; /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ - if (!is_nocb_cpu(smp_processor_id())) + if (!rcu_is_nocb_cpu(smp_processor_id())) return 0; rsp->qlen = 0; rsp->qlen_lazy = 0; @@ -2282,7 +2282,7 @@ static bool nocb_cpu_expendable(int cpu) * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, * then offlining this CPU is harmless. Let it happen. */ - if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) + if (!have_rcu_nocb_mask || rcu_is_nocb_cpu(cpu)) return 1; /* If no memory, play it safe and keep the CPU around. */ @@ -2464,11 +2464,6 @@ static void __init rcu_init_nocb(void) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool is_nocb_cpu(int cpu) -{ - return false; -} - static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2bac5ea2c9af..d71a5f2bd7b2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -203,17 +203,27 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, */ static char __initdata nohz_full_buf[NR_CPUS + 1]; -static int __init init_tick_nohz_full(void) +void __init tick_nohz_init(void) { - if (have_nohz_full_mask) - cpu_notifier(tick_nohz_cpu_down_callback, 0); + int cpu; + + if (!have_nohz_full_mask) + return; + + cpu_notifier(tick_nohz_cpu_down_callback, 0); + + /* Make sure full dynticks CPUs are also RCU nocbs */ + for_each_cpu(cpu, nohz_full_mask) { + if (!rcu_is_nocb_cpu(cpu)) { + pr_warning("NO_HZ: CPU %d is not RCU nocb: " + "cleared from nohz_full range\n", cpu); + cpumask_clear_cpu(cpu, nohz_full_mask); + } + } cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); - - return 0; } -core_initcall(init_tick_nohz_full); #else #define have_nohz_full_mask (0) #endif From f98823ac758ba1aa77c6e3f8ad4ef3ad84ee0a7c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 27 Mar 2013 01:17:22 +0100 Subject: [PATCH 16/39] nohz: New option to default all CPUs in full dynticks range Provide a new kernel config that defaults all CPUs to be part of the full dynticks range, except the boot one for timekeeping. This default setting is overridden by the nohz_full= boot option if passed by the user. This is helpful for those who don't need a fine-grained range of full dynticks CPUs and also for automated testing. Suggested-by: Ingo Molnar Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E.
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/Kconfig | 10 ++++++++++ kernel/time/tick-sched.c | 23 +++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 358d601a4fec..99c3f13dd478 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -128,6 +128,16 @@ config NO_HZ_FULL endchoice +config NO_HZ_FULL_ALL + bool "Full dynticks system on all CPUs by default" + depends on NO_HZ_FULL + help + If the user doesn't pass the nohz_full boot option to + define the range of full dynticks CPUs, consider that all + CPUs in the system are full dynticks by default. + Note the boot CPU will still be kept outside the range to + handle the timekeeping duty. + config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d71a5f2bd7b2..a76e09044f9f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -203,12 +203,31 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, */ static char __initdata nohz_full_buf[NR_CPUS + 1]; +static int tick_nohz_init_all(void) +{ + int err = -1; + +#ifdef CONFIG_NO_HZ_FULL_ALL + if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { + pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); + return err; + } + err = 0; + cpumask_setall(nohz_full_mask); + cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); + have_nohz_full_mask = true; +#endif + return err; +} + void __init tick_nohz_init(void) { int cpu; - if (!have_nohz_full_mask) - return; + if (!have_nohz_full_mask) { + if (tick_nohz_init_all() < 0) + return; + } cpu_notifier(tick_nohz_cpu_down_callback, 0); From a85721601ad22ca4931e106dbc5b2cbf1d28bdea Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 18 Apr 2013 01:31:13 +0200 Subject: [PATCH 17/39] posix_timers: Kick full dynticks CPUs when a posix cpu timer is armed Kick the full dynticks CPUs when a posix cpu timer is enqueued by way of a standard call to posix_cpu_timer_set() or set_process_cpu_timer(). This also includes rescheduled firing timers. This way they can re-evaluate the state of (and possibly restart) their tick against the new expiry modification. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/posix-cpu-timers.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8fd709c9bb58..0bc33561a435 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include /* * Called after updating RLIMIT_CPU to run cpu timer and update @@ -636,6 +638,26 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } +#ifdef CONFIG_NO_HZ_FULL +static void nohz_kick_work_fn(struct work_struct *work) +{ + tick_nohz_full_kick_all(); +} + +static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); + +/* + * We need the IPIs to be sent from sane process context. + * The posix cpu timers are always set with irqs disabled.
+ */ +static void posix_cpu_timer_kick_nohz(void) +{ + schedule_work(&nohz_kick_work); +} +#else +static inline void posix_cpu_timer_kick_nohz(void) { } +#endif + /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -794,6 +816,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } + if (!ret) + posix_cpu_timer_kick_nohz(); return ret; } @@ -1336,6 +1360,13 @@ void run_posix_cpu_timers(struct task_struct *tsk) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } + + /* + * In case some timers were rescheduled after the queue got emptied, + * wake up full dynticks CPUs. + */ + if (tsk->signal->cputimer.running) + posix_cpu_timer_kick_nohz(); } /* @@ -1366,7 +1397,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - return; + goto out; *newval += now.cpu; } @@ -1384,6 +1415,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } +out: + posix_cpu_timer_kick_nohz(); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, From 555347f6c080d2f25265f981c963605b4dd3610d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 19 Apr 2013 16:17:38 +0200 Subject: [PATCH 18/39] posix_timers: New API to prevent from stopping the tick when timers are running Bring a new helper that the full dynticks infrastructure can call in order to know if it can safely stop the tick from the posix cpu timers subsystem point of view. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/posix-timers.h | 2 ++ kernel/posix-cpu-timers.c | 41 +++++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 042058fdb0af..3698d9d08978 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -122,6 +122,8 @@ void run_posix_cpu_timers(struct task_struct *task); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); +bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk); + void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 0bc33561a435..84d5cb372ed5 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -155,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer, } } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. 
+ */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) + return 1; + return 0; +} + static inline cputime_t prof_ticks(struct task_struct *p) { cputime_t utime, stime; @@ -654,6 +669,17 @@ static void posix_cpu_timer_kick_nohz(void) { schedule_work(&nohz_kick_work); } + +bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) +{ + if (!task_cputime_zero(&tsk->cputime_expires)) + return true; + + if (tsk->signal->cputimer.running) + return true; + + return false; +} #else static inline void posix_cpu_timer_kick_nohz(void) { } #endif @@ -1032,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, } } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) - return 1; - return 0; -} - /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers From 6ac29178b4fe8e7c0139375008f014ceb466039d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 21 Apr 2013 20:28:38 +0200 Subject: [PATCH 19/39] posix_timers: Fix pre-condition to stop the tick on full dynticks The test that checks if a CPU can stop its tick from posix CPU timers angle was mistakenly inverted. What we want is to prevent the tick from being stopped as long as the current CPU's task runs a posix CPU timer. Fix this. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/posix-cpu-timers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 84d5cb372ed5..42670e9b44e0 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -673,12 +673,12 @@ static void posix_cpu_timer_kick_nohz(void) bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) { if (!task_cputime_zero(&tsk->cputime_expires)) - return true; + return false; if (tsk->signal->cputimer.running) - return true; + return false; - return false; + return true; } #else static inline void posix_cpu_timer_kick_nohz(void) { } From 12351ef8f9f2226636b00324d841d9c5069d80bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 15:48:22 +0200 Subject: [PATCH 20/39] perf: Kick full dynticks CPU if events rotation is needed Kick the current CPU's tick by sending it a self IPI when an event is queued on the rotation list and it is the first element inserted. This makes sure that perf_event_task_tick() works on full dynticks CPUs. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Stephane Eranian Cc: Jiri Olsa --- kernel/events/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c30..75b58bb75b32 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -655,8 +656,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu) WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) + if (list_empty(&cpuctx->rotation_list)) { + int was_empty = list_empty(head); list_add(&cpuctx->rotation_list, head); + if (was_empty) + tick_nohz_full_kick(); + } } static void get_ctx(struct perf_event_context *ctx) From 026249ef100b5384b6c74c360db46728e98354da Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 15:58:34 +0200 Subject: [PATCH 21/39] perf: New helper to prevent full dynticks CPUs from stopping tick Provide a new helper that helps full dynticks CPUs prevent their tick from being stopped in case there are events in the local rotation list. This way we make sure that perf_event_task_tick() is serviced on demand. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Stephane Eranian Cc: Jiri Olsa --- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e47ee462c2f2..0140830225e2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -799,6 +799,12 @@ static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } #endif +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL) +extern bool perf_event_can_stop_tick(void); +#else +static inline bool perf_event_can_stop_tick(void) { return true; } +#endif + #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 75b58bb75b32..ddb993b52190 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2560,6 +2560,16 @@ done: list_del_init(&cpuctx->rotation_list); } +#ifdef CONFIG_NO_HZ_FULL +bool perf_event_can_stop_tick(void) +{ + if (list_empty(&__get_cpu_var(rotation_list))) + return true; + else + return false; +} +#endif + void perf_event_task_tick(void) { struct list_head *head = &__get_cpu_var(rotation_list); From 9f3660c2c1a221c886474587103c69f6034d3e4f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 14:35:09 +0200 Subject: [PATCH 22/39] sched: Kick full dynticks CPUs that have more than one task enqueued. Kick the tick on full dynticks CPUs when they get more than one task running on their queue. This makes sure that local fairness is maintained by the tick on the destination. This is done regardless of these tasks' class. We should be able to be more clever in the future depending on these classes. E.g. a CPU that runs a SCHED_FIFO task doesn't need to maintain fairness against local pending tasks of the fair class. But keep things simple for now.
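The ordering contract between the enqueue side and the kicked CPU deserves a note; the following is a condensed sketch of the barrier pairing (both halves appear in this patch's hunk and in the next patch's sched_can_stop_tick(), so this is a summary, not additional patch code):

        /*
         * Condensed sketch of the barrier pairing between the enqueue
         * path and the IPI receiver.
         */
        static void kick_side(struct rq *rq)
        {
                rq->nr_running++;               /* second task shows up */
                smp_wmb();                      /* publish nr_running before the IPI */
                smp_send_reschedule(rq->cpu);
        }

        static bool receiver_side(struct rq *rq)
        {
                smp_rmb();                      /* pairs with smp_wmb() in kick_side() */
                return rq->nr_running <= 1;     /* tick may stop only when alone */
        }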
Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/sched/sched.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 889904dd6d77..eb363aa5d83c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "cpupri.h" @@ -1106,6 +1107,16 @@ static inline u64 steal_ticks(u64 steal) static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; + +#ifdef CONFIG_NO_HZ_FULL + if (rq->nr_running == 2) { + if (tick_nohz_full_cpu(rq->cpu)) { + /* Order rq->nr_running write against the IPI */ + smp_wmb(); + smp_send_reschedule(rq->cpu); + } + } +#endif } static inline void dec_nr_running(struct rq *rq) From ce831b38ca4920739a7a5b0c73b921da41f03718 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 15:15:35 +0200 Subject: [PATCH 23/39] sched: New helper to prevent from stopping the tick in full dynticks Provide a new helper to be called from the full dynticks engine before stopping the tick in order to make sure we don't stop it when there is more than one task running on the CPU. This way we make sure that the tick stays alive to maintain fairness. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/sched.h | 6 ++++++ kernel/sched/core.c | 18 ++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1ff9e0a5de27..a74adedcdd10 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1856,6 +1856,12 @@ extern void wake_up_nohz_cpu(int cpu); static inline void wake_up_nohz_cpu(int cpu) { } #endif +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(void); +#else +static inline bool sched_can_stop_tick(void) { return false; } +#endif + #ifdef CONFIG_SCHED_AUTOGROUP extern void sched_autogroup_create_attach(struct task_struct *p); extern void sched_autogroup_detach(struct task_struct *p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0f0a5b3fd62c..69f71335984f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -650,6 +650,24 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ +#ifdef CONFIG_NO_HZ_FULL +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* Make sure rq->nr_running update is visible after the IPI */ + smp_rmb(); + + /* More than one running task needs preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif /* CONFIG_NO_HZ_FULL */ + void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); From ff442c51f6543378cf23107c75b7949dc64a9119 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 15:27:08 +0200 Subject: [PATCH 24/39] nohz: Re-evaluate the tick from the scheduler IPI The scheduler IPI is used by the scheduler to kick full dynticks CPUs asynchronously when more than one task is running or when a new timer list timer is enqueued. This way the destination CPU can decide to restart the tick to handle this new situation.
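Abridged, the receiving side ends up with one more reason to take the slow path; a sketch of the resulting control flow (a paraphrase of the diff below, not separate new code):

        /* Abridged sketch of the reworked scheduler_ipi() entry check. */
        void scheduler_ipi(void)
        {
                /* Cheap early exit only when nobody needs the expensive path: */
                if (llist_empty(&this_rq()->wake_list) &&       /* no remote wakeups */
                    !got_nohz_idle_kick() &&                    /* no idle-balance kick */
                    !tick_nohz_full_cpu(smp_processor_id()))    /* not full dynticks */
                        return;

                irq_enter();
                tick_nohz_full_check();         /* restart the tick if now needed */
                sched_ttwu_pending();           /* rest of the IPI work as before */
                irq_exit();
        }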
Now let's call that kick in the scheduler IPI. (Reusing the scheduler IPI rather than implementing a new IPI was suggested by Peter Zijlstra a while ago) Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/tick.h | 2 ++ kernel/sched/core.c | 4 +++- kernel/time/tick-sched.c | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index d290168335bc..e31e67623ea1 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -160,11 +160,13 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); extern int tick_nohz_full_cpu(int cpu); +extern void tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_all(void); #else static inline void tick_nohz_init(void) { } static inline int tick_nohz_full_cpu(int cpu) { return 0; } +static inline void tick_nohz_full_check(void) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 69f71335984f..9ad35005f1cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1398,7 +1398,8 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() + && !tick_nohz_full_cpu(smp_processor_id())) return; /* @@ -1415,6 +1416,7 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); + tick_nohz_full_check(); sched_ttwu_pending(); /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 884a9f302a06..4d74a68b2c34 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -151,7 +151,7 @@ bool have_nohz_full_mask; * Re-evaluate the need for the tick on the current CPU * and restart it if necessary. */ -static void tick_nohz_full_check(void) +void tick_nohz_full_check(void) { /* * STUB for now, will be filled with the full tick stop/restart From 9014c45d9e2dbb935498a5f1d106e220e8624643 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 15:43:57 +0200 Subject: [PATCH 25/39] nohz: Implement full dynticks kick Implement the full dynticks kick that is performed from IPIs sent by various subsystems (scheduler, posix timers, ...) when they want to notify about a new event that may change the dependency on the tick. Most of the time, such an event ends up restarting the tick. (Part of the design with subsystems providing *_can_stop_tick() helpers suggested by Peter Zijlstra a while ago). Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E.
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 42 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4d74a68b2c34..95d79aeb3e27 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include @@ -147,16 +149,48 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) static cpumask_var_t nohz_full_mask; bool have_nohz_full_mask; +static bool can_stop_full_tick(void) +{ + WARN_ON_ONCE(!irqs_disabled()); + + if (!sched_can_stop_tick()) + return false; + + if (!posix_cpu_timers_can_stop_tick(current)) + return false; + + if (!perf_event_can_stop_tick()) + return false; + + /* sched_clock_tick() needs us? */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + /* + * TODO: kick full dynticks CPUs when + * sched_clock_stable is set. + */ + if (!sched_clock_stable) + return false; +#endif + + return true; +} + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); + /* * Re-evaluate the need for the tick on the current CPU * and restart it if necessary. */ void tick_nohz_full_check(void) { - /* - * STUB for now, will be filled with the full tick stop/restart - * infrastructure patches - */ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (tick_nohz_full_cpu(smp_processor_id())) { + if (ts->tick_stopped && !is_idle_task(current)) { + if (!can_stop_full_tick()) + tick_nohz_restart_sched_tick(ts, ktime_get()); + } + } } static void nohz_full_kick_work_func(struct irq_work *work) From 5811d9963e26146898a24b535b301f7654257f8a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 16:40:31 +0200 Subject: [PATCH 26/39] nohz: Prepare to stop the tick on irq exit Interrupt exit is a natural place to stop the tick: it happens after all the events occurring before and during the irq that are liable to update the dependency on the tick have occurred. Also it makes sure that any check on tick dependency is well ordered against dynticks kick IPIs. Bring in the infrastructure that performs the tick dependency checks on irq exit and shuts the tick down if these checks show that we can do it safely. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E.
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 95d79aeb3e27..d0ed1905a85c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -647,6 +647,24 @@ out: return ret; } +static void tick_nohz_full_stop_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = smp_processor_id(); + + if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) + return; + + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + return; + + if (!can_stop_full_tick()) + return; + + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); +#endif +} + static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* @@ -773,12 +791,13 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - if (!ts->inidle) - return; - - /* Cancel the timer because CPU already waken up from the C-states*/ - menu_hrtimer_cancel(); - __tick_nohz_idle_enter(ts); + if (ts->inidle) { + /* Cancel the timer because CPU already woken up from the C-states */ + menu_hrtimer_cancel(); + __tick_nohz_idle_enter(ts); + } else { + tick_nohz_full_stop_tick(ts); + } } /** From 99e5ada9407cc19d7c4c05ce2165f20dc46fc093 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 17:11:50 +0200 Subject: [PATCH 27/39] nohz: Re-evaluate the tick for the new task after a context switch When a task is scheduled in, it may have some properties of its own that could make the CPU reconsider the need for the tick: posix cpu timers, perf events, ... So notify the full dynticks subsystem when a task gets scheduled in and re-check the tick dependency at this stage. This is done through a self IPI to avoid messing with any current lock scenario. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E.
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/tick.h | 2 ++ kernel/sched/core.c | 2 ++ kernel/time/tick-sched.c | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/include/linux/tick.h b/include/linux/tick.h index e31e67623ea1..9180f4b85e6d 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -163,12 +163,14 @@ extern int tick_nohz_full_cpu(int cpu); extern void tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_all(void); +extern void tick_nohz_task_switch(struct task_struct *tsk); #else static inline void tick_nohz_init(void) { } static inline int tick_nohz_full_cpu(int cpu) { return 0; } static inline void tick_nohz_full_check(void) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } +static inline void tick_nohz_task_switch(struct task_struct *tsk) { } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9ad35005f1cb..dd09def88567 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1896,6 +1896,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + tick_nohz_task_switch(current); } #ifdef CONFIG_SMP diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d0ed1905a85c..12a900dbb819 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -232,6 +232,26 @@ void tick_nohz_full_kick_all(void) preempt_enable(); } +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix cpu timers, ... + */ +void tick_nohz_task_switch(struct task_struct *tsk) +{ + unsigned long flags; + + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + local_irq_save(flags); + + if (tick_nohz_tick_stopped() && !can_stop_full_tick()) + tick_nohz_full_kick(); + + local_irq_restore(flags); +} + int tick_nohz_full_cpu(int cpu) { if (!have_nohz_full_mask) From 67826eae8c16dbf00c262be6ec15021bb42f69c4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 17:43:13 +0200 Subject: [PATCH 28/39] nohz: Disable the tick when irq resume in full dynticks CPU Eventually try to disable tick on irq exit, now that the fundamental infrastructure is in place. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/softirq.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index de15813f2a66..8b1446d4a4db 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -337,6 +337,19 @@ static inline void invoke_softirq(void) } } +static inline void tick_irq_exit(void) +{ +#ifdef CONFIG_NO_HZ_COMMON + int cpu = smp_processor_id(); + + /* Make sure that timer wheel updates are propagated */ + if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { + if (!in_interrupt()) + tick_nohz_irq_exit(); + } +#endif +} + /* * Exit an interrupt context. 
Process softirqs if needed and possible: */ @@ -348,11 +361,7 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -#ifdef CONFIG_NO_HZ_COMMON - /* Make sure that timer wheel updates are propagated */ - if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) - tick_nohz_irq_exit(); -#endif + tick_irq_exit(); rcu_irq_exit(); sched_preempt_enable_no_resched(); } From 0637e029392386e6996f5d6574aadccee8315efa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 22 Apr 2013 21:15:59 +0200 Subject: [PATCH 29/39] nohz: Select wide RCU nocb for full dynticks It makes testing and implementation much easier as we know in advance that all CPUs are RCU nocbs. Also this prepares to remove the dynamic check for nohz_full= boot mask to be a subset of rcu_nocbs= Eventually this should also help removing the requirement for the boot CPU to be outside the full dynticks range. Suggested-by: Christoph Lameter Suggested-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6a792ab4983..e1ac129b01c9 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -110,6 +110,7 @@ config NO_HZ_FULL select NO_HZ_COMMON select RCU_USER_QS select RCU_NOCB_CPU + select RCU_NOCB_CPU_ALL select CONTEXT_TRACKING_FORCE select IRQ_WORK help From cb41a29076e9f95547da46578d5c8804f7b8845d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 17:35:50 +0200 Subject: [PATCH 30/39] nohz: Add basic tracing It's not obvious to find out why the full dynticks subsystem doesn't always stop the tick: whether this is due to kthreads, posix timers, perf events, etc... These new tracepoints are here to help the user diagnose the failures and test this feature. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/trace/events/timer.h | 21 +++++++++++++++++++++ kernel/time/tick-sched.c | 19 +++++++++++++++---- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 425bcfe56c62..f5eb53eb658f 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -323,6 +323,27 @@ TRACE_EVENT(itimer_expire, (int) __entry->pid, (unsigned long long)__entry->now) ); +#ifdef CONFIG_NO_HZ_FULL +TRACE_EVENT(tick_stop, + + TP_PROTO(int success, char *error_msg), + + TP_ARGS(success, error_msg), + + TP_STRUCT__entry( + __field( int , success ) + __string( msg, error_msg ) + ), + + TP_fast_assign( + __entry->success = success; + __assign_str(msg, error_msg); + ), + + TP_printk("success=%s msg=%s", __entry->success ? 
"yes" : "no", __get_str(msg)) +); +#endif + #endif /* _TRACE_TIMER_H */ /* This part must be outside protection */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 12a900dbb819..85e05ab98253 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -28,6 +28,8 @@ #include "tick-internal.h" +#include + /* * Per cpu nohz control structure */ @@ -153,14 +155,20 @@ static bool can_stop_full_tick(void) { WARN_ON_ONCE(!irqs_disabled()); - if (!sched_can_stop_tick()) + if (!sched_can_stop_tick()) { + trace_tick_stop(0, "more than 1 task in runqueue\n"); return false; + } - if (!posix_cpu_timers_can_stop_tick(current)) + if (!posix_cpu_timers_can_stop_tick(current)) { + trace_tick_stop(0, "posix timers running\n"); return false; + } - if (!perf_event_can_stop_tick()) + if (!perf_event_can_stop_tick()) { + trace_tick_stop(0, "perf events running\n"); return false; + } /* sched_clock_tick() needs us? */ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK @@ -168,8 +176,10 @@ static bool can_stop_full_tick(void) * TODO: kick full dynticks CPUs when * sched_clock_stable is set. */ - if (!sched_clock_stable) + if (!sched_clock_stable) { + trace_tick_stop(0, "unstable sched clock\n"); return false; + } #endif return true; @@ -631,6 +641,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; + trace_tick_stop(1, " "); } /* From 2c82d1be4d9d70b5e3da6e179bb033bd53e95299 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 17:35:50 +0200 Subject: [PATCH 31/39] nohz: Fix unavailable tick_stop tracepoint in dynticks idle The trace_tick_stop() tracepoint is only available in full dynticks. But it's also used by dynticks-idle so let's build it for the latter config as well. This fixes: kernel/time/tick-sched.c: In function tick_nohz_stop_sched_tick: kernel/time/tick-sched.c:644: error: implicit declaration of function trace_tick_stop make[2]: *** [kernel/time/tick-sched.o] Erreur 1 Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/trace/events/timer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index f5eb53eb658f..e967dd8a34c6 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -323,7 +323,7 @@ TRACE_EVENT(itimer_expire, (int) __entry->pid, (unsigned long long)__entry->now) ); -#ifdef CONFIG_NO_HZ_FULL +#ifdef CONFIG_NO_HZ_COMMON TRACE_EVENT(tick_stop, TP_PROTO(int success, char *error_msg), From 65e709dc0c25dbd563861924815e9a3a93878b75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 17:35:50 +0200 Subject: [PATCH 32/39] nohz: Remove full dynticks' superfluous dependency on RCU tree Remove the dependency on (TREE_RCU || TREE_PREEMPT_RCU). The full dynticks option already depends on SMP which implies (whatever flavour of) RCU tree config anyway. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/time/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e1ac129b01c9..1ea2bba4a686 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -104,8 +104,6 @@ config NO_HZ_FULL depends on SMP # RCU_USER_QS dependency depends on HAVE_CONTEXT_TRACKING - # RCU_NOCB_CPU dependency - depends on TREE_RCU || TREE_PREEMPT_RCU depends on VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON select RCU_USER_QS From 47aa8b6cbcb839efe2edaa5b50fee21df115d37b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Apr 2013 10:05:59 +0200 Subject: [PATCH 33/39] nohz: Reduce overhead under high-freq idling patterns One testbox of mine (Intel Nehalem, 16-way) uses MWAIT for its idle routine, which apparently can break out of its idle loop rather frequently. In that case NO_HZ_FULL=y kernels show high ksoftirqd overhead and constant context switching, because tick_nohz_stop_sched_tick() will, if delta_jiffies == 0, mis-identify this as a timer event - activating the TIMER_SOFTIRQ, which wakes up ksoftirqd. Fix this by treating delta_jiffies == 0 the same way we treat other short wakeups, delta_jiffies == 1. Cc: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 85e05ab98253..da53c8f2beb5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -565,11 +565,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta_jiffies = rcu_delta_jiffies; } } + /* - * Do not stop the tick, if we are only one off - * or if the cpu is required for rcu + * Do not stop the tick, if we are only one off (or less) + * or if the cpu is required for RCU: */ - if (!ts->tick_stopped && delta_jiffies == 1) + if (!ts->tick_stopped && delta_jiffies <= 1) goto out; /* Schedule the tick, if we are at least one jiffie off */ From c58b0df12a6b5c497637db0676effd00e1fbab13 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 26 Apr 2013 15:16:31 +0200 Subject: [PATCH 34/39] nohz: Select VIRT_CPU_ACCOUNTING_GEN from full dynticks config Turn the full dynticks passive dependency on VIRT_CPU_ACCOUNTING_GEN into an active one. The full dynticks Kconfig is currently hidden behind the full dynticks cputime accounting, which is an awkward and counter-intuitive layout: the user first has to select the dynticks cputime accounting in order to make the full dynticks feature visible. We definitely want it the other way around. The usual way to perform this kind of active dependency is to use "select" on the depended-on target. Now we can't use the Kconfig "select" instruction when the target is a "choice". So this patch takes inspiration from how the RCU subsystem Kconfig interacts with its dependencies on SMP and PREEMPT: we make sure that cputime accounting can't propose any option other than VIRT_CPU_ACCOUNTING_GEN when NO_HZ_FULL is selected by using the right "depends on" instruction for each cputime accounting choice. v2: Keep full dynticks cputime accounting available even without full dynticks, as per Paul McKenney's suggestion.
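Schematically, the resulting Kconfig idiom is the following (a sketch of the pattern being described, not the literal hunks applied below):

        choice
                prompt "Cputime accounting"

        # A "choice" member can't be forced via "select" alone, so every
        # incompatible alternative opts out with "depends on !NO_HZ_FULL":
        config TICK_CPU_ACCOUNTING
                bool "Simple tick based cputime accounting"
                depends on !NO_HZ_FULL

        config VIRT_CPU_ACCOUNTING_GEN
                bool "Full dynticks CPU time accounting"
                # the only choice left visible when NO_HZ_FULL is enabled

        endchoice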
Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Christoph Lameter Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- init/Kconfig | 6 +++--- kernel/time/Kconfig | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index edc8132584f1..8f97a7407714 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -306,7 +306,7 @@ choice # Kind of a stub config for the pure tick based cputime accounting config TICK_CPU_ACCOUNTING bool "Simple tick based cputime accounting" - depends on !S390 + depends on !S390 && !NO_HZ_FULL help This is the basic tick based cputime accounting that maintains statistics about user, system and idle time spent on per jiffies @@ -316,7 +316,7 @@ config TICK_CPU_ACCOUNTING config VIRT_CPU_ACCOUNTING_NATIVE bool "Deterministic task and CPU time accounting" - depends on HAVE_VIRT_CPU_ACCOUNTING + depends on HAVE_VIRT_CPU_ACCOUNTING && !NO_HZ_FULL select VIRT_CPU_ACCOUNTING help Select this option to enable more accurate task and CPU time @@ -346,7 +346,7 @@ config VIRT_CPU_ACCOUNTING_GEN config IRQ_TIME_ACCOUNTING bool "Fine granularity task level IRQ time accounting" - depends on HAVE_IRQ_TIME_ACCOUNTING + depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL help Select this option to enable fine granularity task irq time accounting. This is done by reading a timestamp on each diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 1ea2bba4a686..a2ddd650cb92 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -104,11 +104,13 @@ config NO_HZ_FULL depends on SMP # RCU_USER_QS dependency depends on HAVE_CONTEXT_TRACKING - depends on VIRT_CPU_ACCOUNTING_GEN + # VIRT_CPU_ACCOUNTING_GEN dependency + depends on 64BIT select NO_HZ_COMMON select RCU_USER_QS select RCU_NOCB_CPU select RCU_NOCB_CPU_ALL select VIRT_CPU_ACCOUNTING_GEN select CONTEXT_TRACKING_FORCE select IRQ_WORK help From 8c23b80ec7f1f5405f07bb56c2f8378800ecf401 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Wed, 20 Mar 2013 16:34:25 -0700 Subject: [PATCH 35/39] cputime_nsecs: use math64.h for nsec resolution conversion helpers For the nsec resolution conversions to be usable on non-64-bit architectures, the helpers in <linux/math64.h> need to be used so that the right arch-specific 64-bit math helpers (e.g. do_div()) can be used. Signed-off-by: Kevin Hilman Cc: Christoph Lameter Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E.
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- include/asm-generic/cputime_nsecs.h | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index a8ece9a33aef..2c9e62c2bfd0 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -16,21 +16,27 @@ #ifndef _ASM_GENERIC_CPUTIME_NSECS_H #define _ASM_GENERIC_CPUTIME_NSECS_H +#include + typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; #define cputime_one_jiffy jiffies_to_cputime(1) +#define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) +#define cputime_div_rem(__ct, divisor, remainder) \ + div_u64_rem((__force u64)__ct, divisor, remainder); + /* * Convert cputime <-> jiffies (HZ) */ #define cputime_to_jiffies(__ct) \ - ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) + cputime_div(__ct, NSEC_PER_SEC / HZ) #define cputime_to_scaled(__ct) (__ct) #define jiffies_to_cputime(__jif) \ (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) #define cputime64_to_jiffies64(__ct) \ - ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) + cputime_div(__ct, NSEC_PER_SEC / HZ) #define jiffies64_to_cputime64(__jif) \ (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ)) @@ -45,7 +51,7 @@ typedef u64 __nocast cputime64_t; * Convert cputime <-> microseconds */ #define cputime_to_usecs(__ct) \ - ((__force u64)(__ct) / NSEC_PER_USEC) + cputime_div(__ct, NSEC_PER_USEC) #define usecs_to_cputime(__usecs) \ (__force cputime_t)((__usecs) * NSEC_PER_USEC) #define usecs_to_cputime64(__usecs) \ @@ -55,7 +61,7 @@ typedef u64 __nocast cputime64_t; * Convert cputime <-> seconds */ #define cputime_to_secs(__ct) \ - ((__force u64)(__ct) / NSEC_PER_SEC) + cputime_div(__ct, NSEC_PER_SEC) #define secs_to_cputime(__secs) \ (__force cputime_t)((__secs) * NSEC_PER_SEC) @@ -69,8 +75,10 @@ static inline cputime_t timespec_to_cputime(const struct timespec *val) } static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) { - val->tv_sec = (__force u64) ct / NSEC_PER_SEC; - val->tv_nsec = (__force u64) ct % NSEC_PER_SEC; + u32 rem; + + val->tv_sec = cputime_div_rem(ct, NSEC_PER_SEC, &rem); + val->tv_nsec = rem; } /* @@ -83,15 +91,17 @@ static inline cputime_t timeval_to_cputime(const struct timeval *val) } static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) { - val->tv_sec = (__force u64) ct / NSEC_PER_SEC; - val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC; + u32 rem; + + val->tv_sec = cputime_div_rem(ct, NSEC_PER_SEC, &rem); + val->tv_usec = rem / NSEC_PER_USEC; } /* * Convert cputime <-> clock (USER_HZ) */ #define cputime_to_clock_t(__ct) \ - ((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ)) + cputime_div(__ct, (NSEC_PER_SEC / USER_HZ)) #define clock_t_to_cputime(__x) \ (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ)) From 0c87f9b5ca5bdda1a868b0d7df4bec92e41a468d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Mar 2013 16:27:31 -0700 Subject: [PATCH 36/39] nohz_full: Add documentation. Signed-off-by: Paul E. 
Cc: Frederic Weisbecker
Cc: Steven Rostedt
Cc: Borislav Petkov
Cc: Arjan van de Ven
Cc: Kevin Hilman
Cc: Christoph Lameter
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Olivier Baetz
Reviewed-by: Randy Dunlap
Reviewed-by: Borislav Petkov
Reviewed-by: Kevin Hilman
---
 Documentation/timers/NO_HZ.txt | 273 +++++++++++++++++++++++++++++++++
 1 file changed, 273 insertions(+)
 create mode 100644 Documentation/timers/NO_HZ.txt

diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt
new file mode 100644
index 000000000000..5b5322024067
--- /dev/null
+++ b/Documentation/timers/NO_HZ.txt
@@ -0,0 +1,273 @@
+		NO_HZ: Reducing Scheduling-Clock Ticks
+
+
+This document describes Kconfig options and boot parameters that can
+reduce the number of scheduling-clock interrupts, thereby improving energy
+efficiency and reducing OS jitter.  Reducing OS jitter is important for
+some types of computationally intensive high-performance computing (HPC)
+applications and for real-time applications.
+
+There are two main contexts in which the number of scheduling-clock
+interrupts can be reduced compared to the old-school approach of sending
+a scheduling-clock interrupt to all CPUs every jiffy whether they need
+it or not (CONFIG_HZ_PERIODIC=y or CONFIG_NO_HZ=n for older kernels):
+
+1.	Idle CPUs (CONFIG_NO_HZ_IDLE=y or CONFIG_NO_HZ=y for older kernels).
+
+2.	CPUs having only one runnable task (CONFIG_NO_HZ_FULL=y).
+
+These two cases are described in the following two sections, followed
+by a third section on RCU-specific considerations and a fourth and final
+section listing known issues.
+
+
+IDLE CPUs
+
+If a CPU is idle, there is little point in sending it a scheduling-clock
+interrupt.  After all, the primary purpose of a scheduling-clock interrupt
+is to force a busy CPU to shift its attention among multiple duties,
+and an idle CPU has no duties to shift its attention among.
+
+The CONFIG_NO_HZ_IDLE=y Kconfig option causes the kernel to avoid sending
+scheduling-clock interrupts to idle CPUs, which is critically important
+both to battery-powered devices and to highly virtualized mainframes.
+A battery-powered device running a CONFIG_HZ_PERIODIC=y kernel would
+drain its battery very quickly, easily 2-3 times as fast as would the
+same device running a CONFIG_NO_HZ_IDLE=y kernel.  A mainframe running
+1,500 OS instances might find that half of its CPU time was consumed by
+unnecessary scheduling-clock interrupts.  In these situations, there
+is strong motivation to avoid sending scheduling-clock interrupts to
+idle CPUs.  That said, dyntick-idle mode is not free:
+
+1.	It increases the number of instructions executed on the path
+	to and from the idle loop.
+
+2.	On many architectures, dyntick-idle mode also increases the
+	number of expensive clock-reprogramming operations.
+
+Therefore, systems with aggressive real-time response constraints often
+run CONFIG_HZ_PERIODIC=y kernels (or CONFIG_NO_HZ=n for older kernels)
+in order to avoid degrading from-idle transition latencies.
+
+An idle CPU that is not receiving scheduling-clock interrupts is said to
+be "dyntick-idle", "in dyntick-idle mode", "in nohz mode", or "running
+tickless".  The remainder of this document will use "dyntick-idle mode".
+
+There is also a boot parameter "nohz=" that can be used to disable
+dyntick-idle mode in CONFIG_NO_HZ_IDLE=y kernels by specifying "nohz=off".
+By default, CONFIG_NO_HZ_IDLE=y kernels boot with "nohz=on", enabling
+dyntick-idle mode.
+
+
+CPUs WITH ONLY ONE RUNNABLE TASK
+
+If a CPU has only one runnable task, there is little point in sending it
+a scheduling-clock interrupt because there is no other task to switch to.
+
+The CONFIG_NO_HZ_FULL=y Kconfig option causes the kernel to avoid
+sending scheduling-clock interrupts to CPUs with a single runnable task,
+and such CPUs are said to be "adaptive-ticks CPUs".  This is important
+for applications with aggressive real-time response constraints because
+it allows them to improve their worst-case response times by the maximum
+duration of a scheduling-clock interrupt.  It is also important for
+computationally intensive short-iteration workloads:  If any CPU is
+delayed during a given iteration, all the other CPUs will be forced to
+wait idle while the delayed CPU finishes.  Thus, the delay is multiplied
+by one less than the number of CPUs.  In these situations, there is
+again strong motivation to avoid sending scheduling-clock interrupts.
+
+By default, no CPU will be an adaptive-ticks CPU.  The "nohz_full="
+boot parameter specifies the adaptive-ticks CPUs.  For example,
+"nohz_full=1,6-8" says that CPUs 1, 6, 7, and 8 are to be adaptive-ticks
+CPUs.  Note that you are prohibited from marking all of the CPUs as
+adaptive-tick CPUs:  At least one non-adaptive-tick CPU must remain
+online to handle timekeeping tasks in order to ensure that system calls
+like gettimeofday() return accurate values on adaptive-tick CPUs.
+(This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no
+running user processes to observe slight drifts in clock rate.)
+Therefore, the boot CPU is prohibited from entering adaptive-ticks
+mode.  Specifying a "nohz_full=" mask that includes the boot CPU will
+result in a boot-time error message, and the boot CPU will be removed
+from the mask.
+
+Alternatively, the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter specifies
+that all CPUs other than the boot CPU are adaptive-ticks CPUs.  This
+Kconfig parameter will be overridden by the "nohz_full=" boot parameter,
+so that if both the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter and
+the "nohz_full=1" boot parameter are specified, the boot parameter will
+prevail so that only CPU 1 will be an adaptive-ticks CPU.
+
+Finally, adaptive-ticks CPUs must have their RCU callbacks offloaded.
+This is covered in the "RCU IMPLICATIONS" section below.
+
+Normally, a CPU remains in adaptive-ticks mode as long as possible.
+In particular, transitioning to kernel mode does not automatically change
+the mode.  Instead, the CPU will exit adaptive-ticks mode only if needed,
+for example, if that CPU enqueues an RCU callback.
+
+Just as with dyntick-idle mode, the benefits of adaptive-tick mode do
+not come for free:
+
+1.	CONFIG_NO_HZ_FULL selects CONFIG_NO_HZ_COMMON, so you cannot run
+	adaptive ticks without also running dyntick idle.  This dependency
+	extends down into the implementation, so that all of the costs
+	of CONFIG_NO_HZ_IDLE are also incurred by CONFIG_NO_HZ_FULL.
+
+2.	The user/kernel transitions are slightly more expensive due
+	to the need to inform kernel subsystems (such as RCU) about
+	the change in mode.
+
+3.	POSIX CPU timers on adaptive-tick CPUs may miss their deadlines
+	(perhaps indefinitely) because they currently rely on
+	scheduling-tick interrupts.  This will likely be fixed in
+	one of two ways: (1) Prevent CPUs with POSIX CPU timers from
+	entering adaptive-tick mode, or (2) Use hrtimers or some other
+	adaptive-ticks-immune mechanism to cause the POSIX CPU timer to
+	fire properly.
+4.	If there are more perf events pending than the hardware can
+	accommodate, they are normally round-robined so as to collect
+	all of them over time.  Adaptive-tick mode may prevent this
+	round-robining from happening.  This will likely be fixed by
+	preventing CPUs with large numbers of perf events pending from
+	entering adaptive-tick mode.
+
+5.	Scheduler statistics for adaptive-tick CPUs may be computed
+	slightly differently than those for non-adaptive-tick CPUs.
+	This might in turn perturb load-balancing of real-time tasks.
+
+6.	The LB_BIAS scheduler feature is disabled by adaptive ticks.
+
+Although improvements are expected over time, adaptive ticks is quite
+useful for many types of real-time and compute-intensive applications.
+However, the drawbacks listed above mean that adaptive ticks should not
+(yet) be enabled by default.
+
+
+RCU IMPLICATIONS
+
+There are situations in which idle CPUs cannot be permitted to
+enter either dyntick-idle mode or adaptive-tick mode, the most
+common being when that CPU has RCU callbacks pending.
+
+The CONFIG_RCU_FAST_NO_HZ=y Kconfig option may be used to cause such CPUs
+to enter dyntick-idle mode or adaptive-tick mode anyway.  In this case,
+a timer will awaken these CPUs every four jiffies in order to ensure
+that the RCU callbacks are processed in a timely fashion.
+
+Another approach is to offload RCU callback processing to "rcuo" kthreads
+using the CONFIG_RCU_NOCB_CPU=y Kconfig option.  The specific CPUs to
+offload may be selected via several methods:
+
+1.	One of three mutually exclusive Kconfig options specifies a
+	build-time default for the CPUs to offload:
+
+	a.	The CONFIG_RCU_NOCB_CPU_NONE=y Kconfig option results in
+		no CPUs being offloaded.
+
+	b.	The CONFIG_RCU_NOCB_CPU_ZERO=y Kconfig option causes
+		CPU 0 to be offloaded.
+
+	c.	The CONFIG_RCU_NOCB_CPU_ALL=y Kconfig option causes all
+		CPUs to be offloaded.  Note that the callbacks will be
+		offloaded to "rcuo" kthreads, and that those kthreads
+		will in fact run on some CPU.  However, this approach
+		gives fine-grained control over exactly which CPUs the
+		callbacks run on, along with their scheduling priority
+		(including the default of SCHED_OTHER), and it further
+		allows this control to be varied dynamically at runtime.
+
+2.	The "rcu_nocbs=" kernel boot parameter, which takes a comma-separated
+	list of CPUs and CPU ranges, for example, "1,3-5" selects CPUs 1,
+	3, 4, and 5.  The specified CPUs will be offloaded in addition to
+	any CPUs specified as offloaded by CONFIG_RCU_NOCB_CPU_ZERO=y or
+	CONFIG_RCU_NOCB_CPU_ALL=y.  This means that the "rcu_nocbs=" boot
+	parameter has no effect for kernels built with RCU_NOCB_CPU_ALL=y.
+
+The offloaded CPUs will never queue RCU callbacks, and therefore RCU
+never prevents offloaded CPUs from entering either dyntick-idle mode
+or adaptive-tick mode.  That said, note that it is up to userspace to
+pin the "rcuo" kthreads to specific CPUs if desired.  Otherwise, the
+scheduler will decide where to run them, which might or might not be
+where you want them to run.
+
+
+KNOWN ISSUES
+
+o	Dyntick-idle slows transitions to and from idle slightly.
+	In practice, this has not been a problem except for the most
+	aggressive real-time workloads, which have the option of disabling
+	dyntick-idle mode, an option that most of them take.  However,
+	some workloads will no doubt want to use adaptive ticks to
+	eliminate scheduling-clock interrupt latencies.  Here are some
+	options for these workloads:
+
+	a.	Use PMQOS from userspace to inform the kernel of your
+		latency requirements (preferred).
+
+	b.	On x86 systems, use the "idle=mwait" boot parameter.
+
+	c.	On x86 systems, use the "intel_idle.max_cstate=" boot
+		parameter to limit the maximum C-state depth.
+
+	d.	On x86 systems, use the "idle=poll" boot parameter.
+		However, please note that use of this parameter can cause
+		your CPU to overheat, which may cause thermal throttling
+		to degrade your latencies -- and that this degradation can
+		be even worse than that of dyntick-idle.  Furthermore,
+		this parameter effectively disables Turbo Mode on Intel
+		CPUs, which can significantly reduce maximum performance.
+
+o	Adaptive-ticks slows user/kernel transitions slightly.
+	This is not expected to be a problem for computationally intensive
+	workloads, which have few such transitions.  Careful benchmarking
+	will be required to determine whether or not other workloads
+	are significantly affected.
+
+o	Adaptive-ticks does not do anything unless there is only one
+	runnable task for a given CPU, even though there are a number
+	of other situations where the scheduling-clock tick is not
+	needed.  To give but one example, consider a CPU that has one
+	runnable high-priority SCHED_FIFO task and an arbitrary number
+	of low-priority SCHED_OTHER tasks.  In this case, the CPU is
+	required to run the SCHED_FIFO task until it either blocks or
+	some other higher-priority task awakens on (or is assigned to)
+	this CPU, so there is no point in sending a scheduling-clock
+	interrupt to this CPU.  However, the current implementation
+	nevertheless sends scheduling-clock interrupts to CPUs having a
+	single runnable SCHED_FIFO task and multiple runnable SCHED_OTHER
+	tasks, even though these interrupts are unnecessary.
+
+	Better handling of these sorts of situations is future work.
+
+o	A reboot is required to reconfigure both adaptive idle and RCU
+	callback offloading.  Runtime reconfiguration could be provided
+	if needed; however, due to the complexity of reconfiguring RCU
+	at runtime, there would need to be an earthshakingly good reason,
+	especially given that you have the straightforward option of
+	simply offloading RCU callbacks from all CPUs and pinning them
+	where you want them whenever you want them pinned.
+
+o	Additional configuration is required to deal with other sources
+	of OS jitter, including interrupts and system-utility tasks
+	and processes.  This configuration normally involves binding
+	interrupts and tasks to particular CPUs.
+
+o	Some sources of OS jitter can currently be eliminated only by
+	constraining the workload.  For example, the only way to eliminate
+	OS jitter due to global TLB shootdowns is to avoid the unmapping
+	operations (such as kernel module unload operations) that
+	result in these shootdowns.  For another example, page faults
+	and TLB misses can be reduced (and in some cases eliminated) by
+	using huge pages and by constraining the amount of memory used
+	by the application.  Pre-faulting the working set can also be
+	helpful, especially when combined with the mlock() and mlockall()
+	system calls.
+
+o	Unless all CPUs are idle, at least one CPU must keep the
+	scheduling-clock interrupt going in order to support accurate
+	timekeeping.
+
+o	If there are adaptive-ticks CPUs, there will be at least one
+	CPU keeping the scheduling-clock interrupt going, even if all
+	CPUs are otherwise idle.
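An illustrative sketch, not part of the patch series: the documentation
above says background noise should land on the CPUs left out of the
nohz_full mask. Under that assumption, subsystem code of this era could
use the series' tick_nohz_full_cpu() helper to steer per-CPU work; the
queue_background_work_on() helper below is hypothetical, standing in for
whatever per-CPU work a subsystem actually dispatches.

	#include <linux/tick.h>
	#include <linux/cpumask.h>

	static void queue_background_work_on(int cpu);	/* hypothetical */

	static void spread_background_work(void)
	{
		int cpu;

		/*
		 * Skip CPUs whose tick may be stopped while they run a
		 * single task; route the noise to housekeeping CPUs
		 * instead.  At least one online CPU is always kept out
		 * of the nohz_full mask (the timekeeping CPU), so this
		 * loop always queues the work somewhere.
		 */
		for_each_online_cpu(cpu) {
			if (tick_nohz_full_cpu(cpu))
				continue;
			queue_background_work_on(cpu);
		}
	}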
From 6296ace467c8640317414ba589b124323806f7ce Mon Sep 17 00:00:00 2001
From: Li Zhong
Date: Sun, 28 Apr 2013 11:25:58 +0800
Subject: [PATCH 37/39] nohz: Protect smp_processor_id() in tick_nohz_task_switch()

I saw the following error when testing the latest nohz code on Power:

[   85.295384] BUG: using smp_processor_id() in preemptible [00000000] code: rsyslogd/3493
[   85.295396] caller is .tick_nohz_task_switch+0x1c/0xb8
[   85.295402] Call Trace:
[   85.295408] [c0000001fababab0] [c000000000012dc4] .show_stack+0x110/0x25c (unreliable)
[   85.295420] [c0000001fababba0] [c0000000007c4b54] .dump_stack+0x20/0x30
[   85.295430] [c0000001fababc10] [c00000000044eb74] .debug_smp_processor_id+0xf4/0x124
[   85.295438] [c0000001fababca0] [c0000000000d7594] .tick_nohz_task_switch+0x1c/0xb8
[   85.295447] [c0000001fababd20] [c0000000000b9748] .finish_task_switch+0x13c/0x160
[   85.295455] [c0000001fababdb0] [c0000000000bbe50] .schedule_tail+0x50/0x124
[   85.295463] [c0000001fababe30] [c000000000009dc8] .ret_from_fork+0x4/0x54

The code below moves the test into the local_irq_save/restore
section to avoid the above complaint.

Signed-off-by: Li Zhong
Acked-by: Frederic Weisbecker
Cc: Peter Zijlstra
Cc: Paul McKenney
Link: http://lkml.kernel.org/r/1367119558.6391.34.camel@ThinkPad-T5421.cn.ibm.com
Signed-off-by: Ingo Molnar
---
 kernel/time/tick-sched.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index da53c8f2beb5..1c9f53b2ddb7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -251,14 +251,15 @@ void tick_nohz_task_switch(struct task_struct *tsk)
 {
 	unsigned long flags;

-	if (!tick_nohz_full_cpu(smp_processor_id()))
-		return;
-
 	local_irq_save(flags);

+	if (!tick_nohz_full_cpu(smp_processor_id()))
+		goto out;
+
 	if (tick_nohz_tick_stopped() && !can_stop_full_tick())
 		tick_nohz_full_kick();

+out:
 	local_irq_restore(flags);
 }
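The warning fixed above is the generic "smp_processor_id() in
preemptible context" pattern: the task can migrate between reading the
CPU id and acting on it. A minimal sketch of the two usual remedies
follows; do_per_cpu_thing() is a hypothetical stand-in for the per-CPU
action (in the patch, the tick_nohz_full_cpu() test plus kick), not an
API from the series.

	#include <linux/smp.h>

	static void do_per_cpu_thing(int cpu);	/* hypothetical */

	static void cpu_local_action_irqsoff(void)
	{
		unsigned long flags;

		/*
		 * Option 1 (what the patch does): read the CPU id with
		 * interrupts disabled, so the task cannot migrate between
		 * reading the id and using it.
		 */
		local_irq_save(flags);
		do_per_cpu_thing(smp_processor_id());
		local_irq_restore(flags);
	}

	static void cpu_local_action_preempt(void)
	{
		/* Option 2: get_cpu() disables preemption until put_cpu(). */
		int cpu = get_cpu();

		do_per_cpu_thing(cpu);
		put_cpu();
	}

The patch uses the first form because tick_nohz_task_switch() already
brackets the kick with local_irq_save/restore; the test simply moves
inside that section.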
From 73c30828771acafb0a5e3a1c4cf75e6c5dc5f98a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 3 May 2013 01:28:12 +0200
Subject: [PATCH 38/39] rcu: Fix full dynticks' dependency on wide RCU nocb mode

Commit 0637e029392386e6996f5d6574aadccee8315efa
("nohz: Select wide RCU nocb for full dynticks") intended to force
CONFIG_RCU_NOCB_CPU_ALL=y when full dynticks is enabled. However, this
option is part of a choice menu and Kconfig's "select" instruction has
no effect on such targets.

Fix this by using reverse dependencies on the targets we don't
want instead.

Reviewed-by: Paul E. McKenney
Signed-off-by: Frederic Weisbecker
Cc: Christoph Lameter
Cc: Hakan Akkan
Cc: Ingo Molnar
Cc: Kevin Hilman
Cc: Li Zhong
Cc: Paul Gortmaker
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: Thomas Gleixner
---
 init/Kconfig        | 4 ++--
 kernel/time/Kconfig | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 66f67afad4fa..b3fdf9aee3da 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -687,7 +687,7 @@ choice

 config RCU_NOCB_CPU_NONE
 	bool "No build_forced no-CBs CPUs"
-	depends on RCU_NOCB_CPU
+	depends on RCU_NOCB_CPU && !NO_HZ_FULL
 	help
 	  This option does not force any of the CPUs to be no-CBs CPUs.
 	  Only CPUs designated by the rcu_nocbs= boot parameter will be
@@ -695,7 +695,7 @@ config RCU_NOCB_CPU_NONE

 config RCU_NOCB_CPU_ZERO
 	bool "CPU 0 is a build_forced no-CBs CPU"
-	depends on RCU_NOCB_CPU
+	depends on RCU_NOCB_CPU && !NO_HZ_FULL
 	help
 	  This option forces CPU 0 to be a no-CBs CPU.
 	  Additional CPUs may be designated as no-CBs CPUs using the rcu_nocbs= boot
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index a2ddd650cb92..e4c07b0692bb 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -109,7 +109,6 @@ config NO_HZ_FULL
 	select NO_HZ_COMMON
 	select RCU_USER_QS
 	select RCU_NOCB_CPU
-	select RCU_NOCB_CPU_ALL
 	select VIRT_CPU_ACCOUNTING_GEN
 	select CONTEXT_TRACKING_FORCE
 	select IRQ_WORK

From 265f22a975c1e4cc3a4d1f94a3ec53ffbb6f5b9f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 3 May 2013 03:39:05 +0200
Subject: [PATCH 39/39] sched: Keep at least 1 tick per second for active dynticks tasks

The scheduler doesn't yet fully support environments with a single task
running without a periodic tick. In order to ensure we still maintain
the duties of scheduler_tick(), keep at least 1 tick per second.

This makes sure that we keep the progression of various scheduler
accounting and background maintenance even with a very low granularity.
Examples include cpu load, sched average, CFS entity vruntime, avenrun,
and events such as load balancing, amongst other details handled in
sched_class::task_tick().

This limitation will be removed in the future once we get these
individual items to work on full dynticks CPUs.

Suggested-by: Ingo Molnar
Signed-off-by: Frederic Weisbecker
Cc: Christoph Lameter
Cc: Hakan Akkan
Cc: Ingo Molnar
Cc: Kevin Hilman
Cc: Li Zhong
Cc: Paul E. McKenney
Cc: Paul Gortmaker
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: Thomas Gleixner
---
 include/linux/sched.h    |  1 +
 kernel/sched/core.c      | 30 ++++++++++++++++++++++++++++++
 kernel/sched/idle_task.c |  1 +
 kernel/sched/sched.h     | 10 ++++++++++
 kernel/time/tick-sched.c |  7 +++++++
 5 files changed, 49 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ebf7095158a9..af008d7bad57 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1862,6 +1862,7 @@ static inline void wake_up_nohz_cpu(int cpu) { }

 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(void);
+extern u64 scheduler_tick_max_deferment(void);
 #else
 static inline bool sched_can_stop_tick(void) { return false; }
 #endif

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e94842d4400c..3bdf986a091a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2736,8 +2736,35 @@ void scheduler_tick(void)
 	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
+	rq_last_tick_reset(rq);
 }

+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support full dynticks environment.
+ *
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+	struct rq *rq = this_rq();
+	unsigned long next, now = ACCESS_ONCE(jiffies);
+
+	next = rq->last_sched_tick + HZ;
+
+	if (time_before_eq(next, now))
+		return 0;
+
+	return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
+}
+#endif
+
 notrace unsigned long get_parent_ip(unsigned long addr)
 {
 	if (in_lock_functions(addr)) {
@@ -6993,6 +7020,9 @@ void __init sched_init(void)
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->nohz_flags = 0;
 #endif
+#ifdef CONFIG_NO_HZ_FULL
+		rq->last_sched_tick = 0;
+#endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);

diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b8ce77328341..d8da01008d39 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -17,6 +17,7 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
 {
 	idle_exit_fair(rq);
+	rq_last_tick_reset(rq);
 }

 static void post_schedule_idle(struct rq *rq)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24dc29897749..ce39224d6155 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,9 @@ struct rq {
 #ifdef CONFIG_NO_HZ_COMMON
 	u64 nohz_stamp;
 	unsigned long nohz_flags;
+#endif
+#ifdef CONFIG_NO_HZ_FULL
+	unsigned long last_sched_tick;
 #endif
 	int skip_clock_update;

@@ -1090,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq)
 	rq->nr_running--;
 }

+static inline void rq_last_tick_reset(struct rq *rq)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	rq->last_sched_tick = jiffies;
+#endif
+}
+
 extern void update_rq_clock(struct rq *rq);

 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1c9f53b2ddb7..07929c633570 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -600,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		time_delta = KTIME_MAX;
 	}

+#ifdef CONFIG_NO_HZ_FULL
+	if (!ts->inidle) {
+		time_delta = min(time_delta,
+				 scheduler_tick_max_deferment());
+	}
+#endif
+
 	/*
 	 * calculate the expiry time for the next timer wheel
 	 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
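A worked example of the bound computed by scheduler_tick_max_deferment()
above, redone as freestanding C so the arithmetic can be checked outside
the kernel. The HZ value and jiffies numbers are assumptions chosen for
illustration, not values from the series.

	#include <stdio.h>

	#define HZ		1000ULL		/* assumed tick rate */
	#define NSEC_PER_SEC	1000000000ULL

	int main(void)
	{
		/* Assumed jiffies values, for illustration only. */
		unsigned long long last_sched_tick = 4000;
		unsigned long long now = 4250;
		unsigned long long next = last_sched_tick + HZ;

		/* 750 remaining jiffies -> 750,000,000 ns (750 ms). */
		unsigned long long max_defer_ns = next > now ?
			(next - now) * (NSEC_PER_SEC / HZ) : 0;

		/* tick_nohz_stop_sched_tick() clamps longer sleeps to it. */
		unsigned long long wanted_ns = 5 * NSEC_PER_SEC;
		unsigned long long granted_ns =
			wanted_ns < max_defer_ns ? wanted_ns : max_defer_ns;

		printf("max deferment %llu ns, granted %llu ns\n",
		       max_defer_ns, granted_ns);
		return 0;
	}

With these numbers a tick last seen at jiffy 4000 can be deferred until
jiffy 5000 at the latest, which is how a busy single-task CPU still sees
at least one tick per second.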