From ced6d5c11d3e7b342f1a80f908e6756ebd4b8ddd Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Fri, 22 Dec 2017 15:51:12 +0100 Subject: [PATCH 1/5] timers: Use deferrable base independent of base::nohz_active During boot and before base::nohz_active is set in the timer bases, deferrable timers are enqueued into the standard timer base. This works correctly as long as base::nohz_active is false. Once it base::nohz_active is set and a timer which was enqueued before that is accessed the lock selector code choses the lock of the deferred base. This causes unlocked access to the standard base and in case the timer is removed it does not clear the pending flag in the standard base bitmap which causes get_next_timer_interrupt() to return bogus values. To prevent that, the deferrable timers must be enqueued in the deferrable base, even when base::nohz_active is not set. Those deferrable timers also need to be expired unconditional. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: stable@vger.kernel.org Cc: rt@linutronix.de Cc: Paul McKenney Link: https://lkml.kernel.org/r/20171222145337.633328378@linutronix.de --- kernel/time/timer.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ffebcf878fba..19a9c3da7698 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); return base; } @@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = this_cpu_ptr(&timer_bases[BASE_DEF]); return base; } @@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) base->must_forward_clk = false; __run_timers(base); - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } From 26456f87aca7157c057de65c9414b37f1ab881d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Dec 2017 21:37:25 +0100 Subject: [PATCH 2/5] timers: Reinitialize per cpu bases on hotplug The timer wheel bases are not (re)initialized on CPU hotplug. That leaves them with a potentially stale clk and next_expiry valuem, which can cause trouble then the CPU is plugged. Add a prepare callback which forwards the clock, sets next_expiry to far in the future and reset the control flags to a known state. Set base->must_forward_clk so the first timer which is queued will try to forward the clock to current jiffies. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: Anna-Maria Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272152200.2431@nanos --- include/linux/cpuhotplug.h | 2 +- include/linux/timer.h | 4 +++- kernel/cpu.c | 4 ++-- kernel/time/timer.c | 15 +++++++++++++++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 201ab7267986..1a32e558eb11 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -86,7 +86,7 @@ enum cpuhp_state { CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, - CPUHP_TIMERS_DEAD, + CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, diff --git a/include/linux/timer.h b/include/linux/timer.h index 04af640ea95b..2448f9cc48a3 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -207,9 +207,11 @@ unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); #ifdef CONFIG_HOTPLUG_CPU +int timers_prepare_cpu(unsigned int cpu); int timers_dead_cpu(unsigned int cpu); #else -#define timers_dead_cpu NULL +#define timers_prepare_cpu NULL +#define timers_dead_cpu NULL #endif #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 41376c3ac93b..97858477e586 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = { * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. */ - [CPUHP_TIMERS_DEAD] = { + [CPUHP_TIMERS_PREPARE] = { .name = "timers:dead", - .startup.single = NULL, + .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 19a9c3da7698..6be576e02209 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1853,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(&timer_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->is_idle = false; + base->must_forward_clk = true; + } + return 0; +} + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; From 5d62c183f9e9df1deeea0906d099a94e8a43047a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Dec 2017 15:51:13 +0100 Subject: [PATCH 3/5] nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick() The conditions in irq_exit() to invoke tick_nohz_irq_exit() which subsequently invokes tick_nohz_stop_sched_tick() are: if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) If need_resched() is not set, but a timer softirq is pending then this is an indication that the softirq code punted and delegated the execution to softirqd. need_resched() is not true because the current interrupted task takes precedence over softirqd. Invoking tick_nohz_irq_exit() in this case can cause an endless loop of timer interrupts because the timer wheel contains an expired timer, but softirqs are not yet executed. So it returns an immediate expiry request, which causes the timer to fire immediately again. Lather, rinse and repeat.... Prevent that by adding a check for a pending timer soft interrupt to the conditions in tick_nohz_stop_sched_tick() which avoid calling get_next_timer_interrupt(). That keeps the tick sched timer on the tick and prevents a repetitive programming of an already expired timer. Reported-by: Sebastian Siewior Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul McKenney Cc: Anna-Maria Gleixner Cc: Sebastian Siewior Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272156050.2431@nanos --- kernel/time/tick-sched.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 77555faf6fbc..f7cc7abfcf25 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) ts->next_tick = 0; } +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & TIMER_SOFTIRQ; +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; - if (rcu_needs_cpu(basemono, &next_rcu) || - arch_needs_cpu() || irq_work_needs_cpu()) { + /* + * Keep the periodic tick, when RCU, architecture or irq_work + * requests it. + * Aside of that check whether the local timer softirq is + * pending. If so its a bad idea to call get_next_timer_interrupt() + * because there is an already expired timer, so it will request + * immeditate expiry, which rearms the hardware timer with a + * minimal delta which brings us back to this place + * immediately. Lather, rinse and repeat... + */ + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* From fd45bb77ad682be728d1002431d77b8c73342836 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Dec 2017 15:51:14 +0100 Subject: [PATCH 4/5] timers: Invoke timer_start_debug() where it makes sense The timer start debug function is called before the proper timer base is set. As a consequence the trace data contains the stale CPU and flags values. Call the debug function after setting the new base and flags. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: stable@vger.kernel.org Cc: rt@linutronix.de Cc: Paul McKenney Cc: Anna-Maria Gleixner Link: https://lkml.kernel.org/r/20171222145337.792907137@linutronix.de --- kernel/time/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 6be576e02209..89a9e1b4264a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1007,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; - debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags); if (base != new_base) { @@ -1032,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } + debug_activate(timer, expires); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance From 9f4533cd7334235cd4c9b9fb1b0b8791e2ba01a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Dec 2017 15:51:15 +0100 Subject: [PATCH 5/5] timerqueue: Document return values of timerqueue_add/del() The return values of timerqueue_add/del() are not documented in the kernel doc comment. Add proper documentation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: rt@linutronix.de Cc: Paul McKenney Cc: Anna-Maria Gleixner Link: https://lkml.kernel.org/r/20171222145337.872681338@linutronix.de --- lib/timerqueue.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/timerqueue.c b/lib/timerqueue.c index 4a720ed4fdaf..0d54bcbc8170 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -33,8 +33,9 @@ * @head: head of timerqueue * @node: timer node to be added * - * Adds the timer node to the timerqueue, sorted by the - * node's expires value. + * Adds the timer node to the timerqueue, sorted by the node's expires + * value. Returns true if the newly added timer is the first expiring timer in + * the queue. */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { @@ -70,7 +71,8 @@ EXPORT_SYMBOL_GPL(timerqueue_add); * @head: head of timerqueue * @node: timer node to be removed * - * Removes the timer node from the timerqueue. + * Removes the timer node from the timerqueue. Returns true if the queue is + * not empty after the remove. */ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) {