From 519248f36d6f3c80e176f6fa844c10d94f1f5990 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 May 2019 05:39:25 -0700 Subject: [PATCH 01/28] lockdep: Make print_lock() address visible Security is a wonderful thing, but so is the ability to debug based on lockdep warnings. This commit therefore makes lockdep lock addresses visible in the clear. Signed-off-by: Paul E. McKenney --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4861cf8e274b..4aca3f4379d2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -620,7 +620,7 @@ static void print_lock(struct held_lock *hlock) return; } - printk(KERN_CONT "%p", hlock->instance); + printk(KERN_CONT "%px", hlock->instance); print_lock_name(lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } From b55bd585551ed2220eefdab96b31e6f935310eec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 May 2019 05:39:25 -0700 Subject: [PATCH 02/28] time/tick-broadcast: Fix tick_broadcast_offline() lockdep complaint The TASKS03 and TREE04 rcutorture scenarios produce the following lockdep complaint: ------------------------------------------------------------------------ ================================ WARNING: inconsistent lock state 5.2.0-rc1+ #513 Not tainted -------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. migration/1/14 [HC0[0]:SC0[0]:HE1:SE1] takes: (____ptrval____) (tick_broadcast_lock){?...}, at: tick_broadcast_offline+0xf/0x70 {IN-HARDIRQ-W} state was registered at: lock_acquire+0xb0/0x1c0 _raw_spin_lock_irqsave+0x3c/0x50 tick_broadcast_switch_to_oneshot+0xd/0x40 tick_switch_to_oneshot+0x4f/0xd0 hrtimer_run_queues+0xf3/0x130 run_local_timers+0x1c/0x50 update_process_times+0x1c/0x50 tick_periodic+0x26/0xc0 tick_handle_periodic+0x1a/0x60 smp_apic_timer_interrupt+0x80/0x2a0 apic_timer_interrupt+0xf/0x20 _raw_spin_unlock_irqrestore+0x4e/0x60 rcu_nocb_gp_kthread+0x15d/0x590 kthread+0xf3/0x130 ret_from_fork+0x3a/0x50 irq event stamp: 171 hardirqs last enabled at (171): [] trace_hardirqs_on_thunk+0x1a/0x1c hardirqs last disabled at (170): [] trace_hardirqs_off_thunk+0x1a/0x1c softirqs last enabled at (0): [] copy_process.part.56+0x650/0x1cb0 softirqs last disabled at (0): [<0000000000000000>] 0x0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(tick_broadcast_lock); lock(tick_broadcast_lock); *** DEADLOCK *** 1 lock held by migration/1/14: #0: (____ptrval____) (clockevents_lock){+.+.}, at: tick_offline_cpu+0xf/0x30 stack backtrace: CPU: 1 PID: 14 Comm: migration/1 Not tainted 5.2.0-rc1+ #513 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x5e/0x8b print_usage_bug+0x1fc/0x216 ? print_shortest_lock_dependencies+0x1b0/0x1b0 mark_lock+0x1f2/0x280 __lock_acquire+0x1e0/0x18f0 ? __lock_acquire+0x21b/0x18f0 ? _raw_spin_unlock_irqrestore+0x4e/0x60 lock_acquire+0xb0/0x1c0 ? tick_broadcast_offline+0xf/0x70 _raw_spin_lock+0x33/0x40 ? tick_broadcast_offline+0xf/0x70 tick_broadcast_offline+0xf/0x70 tick_offline_cpu+0x16/0x30 take_cpu_down+0x7d/0xa0 multi_cpu_stop+0xa2/0xe0 ? cpu_stop_queue_work+0xc0/0xc0 cpu_stopper_thread+0x6d/0x100 smpboot_thread_fn+0x169/0x240 kthread+0xf3/0x130 ? sort_range+0x20/0x20 ? 
kthread_cancel_delayed_work_sync+0x10/0x10 ret_from_fork+0x3a/0x50 ------------------------------------------------------------------------ To reproduce, run the following rcutorture test: tools/testing/selftests/rcutorture/bin/kvm.sh --duration 5 --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" --configs "TASKS03 TREE04" It turns out that tick_broadcast_offline() was an innocent bystander. After all, interrupts are supposed to be disabled throughout take_cpu_down(), and therefore should have been disabled upon entry to tick_offline_cpu() and thus to tick_broadcast_offline(). This suggests that one of the CPU-hotplug notifiers was incorrectly enabling interrupts, and leaving them enabled on return. Some debugging code showed that the culprit was sched_cpu_dying(). It had irqs enabled after return from sched_tick_stop(). Which in turn had irqs enabled after return from cancel_delayed_work_sync(). Which is a wrapper around __cancel_work_timer(). Which can sleep in the case where something else is concurrently trying to cancel the same delayed work, and as Thomas Gleixner pointed out on IRC, sleeping is a decidedly bad idea when you are invoked from take_cpu_down(), regardless of the state you leave interrupts in upon return. Code inspection located no reason why the delayed work absolutely needed to be canceled from sched_tick_stop(): The work is not bound to the outgoing CPU by design, given that the whole point is to collect statistics without disturbing the outgoing CPU. This commit therefore simply drops the cancel_delayed_work_sync() from sched_tick_stop(). Instead, a new ->state field is added to the tick_work structure so that the delayed-work handler function sched_tick_remote() can avoid reposting itself. A cpu_is_offline() check is also added to sched_tick_remote() to avoid mucking with the state of an offlined CPU (though it does appear safe to do so). The sched_tick_start() and sched_tick_stop() functions also update ->state, and sched_tick_start() also schedules the delayed work if ->state indicates that it is not already in flight. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Reviewed-by: Frederic Weisbecker [ paulmck: Apply Peter Zijlstra and Frederic Weisbecker atomics feedback. ] Acked-by: Peter Zijlstra (Intel) --- kernel/sched/core.c | 57 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..0b22e55cebe8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3486,8 +3486,36 @@ void scheduler_tick(void) struct tick_work { int cpu; + atomic_t state; struct delayed_work work; }; +/* Values for ->state, see diagram below. */ +#define TICK_SCHED_REMOTE_OFFLINE 0 +#define TICK_SCHED_REMOTE_OFFLINING 1 +#define TICK_SCHED_REMOTE_RUNNING 2 + +/* + * State diagram for ->state: + * + * + * TICK_SCHED_REMOTE_OFFLINE + * | ^ + * | | + * | | sched_tick_remote() + * | | + * | | + * +--TICK_SCHED_REMOTE_OFFLINING + * | ^ + * | | + * sched_tick_start() | | sched_tick_stop() + * | | + * V | + * TICK_SCHED_REMOTE_RUNNING + * + * + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() + * and sched_tick_start() are happy to leave the state in RUNNING. 
+ */ static struct tick_work __percpu *tick_work_cpu; @@ -3500,6 +3528,7 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr; struct rq_flags rf; u64 delta; + int os; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3513,7 +3542,7 @@ static void sched_tick_remote(struct work_struct *work) rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr)) + if (is_idle_task(curr) || cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); @@ -3533,13 +3562,18 @@ out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough - * to keep scheduler internal stats reasonably up to date. + * to keep scheduler internal stats reasonably up to date. But + * first update state to reflect hotplug activity if required. */ - queue_delayed_work(system_unbound_wq, dwork, HZ); + os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); + if (os == TICK_SCHED_REMOTE_RUNNING) + queue_delayed_work(system_unbound_wq, dwork, HZ); } static void sched_tick_start(int cpu) { + int os; struct tick_work *twork; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) @@ -3548,15 +3582,20 @@ static void sched_tick_start(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - twork->cpu = cpu; - INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); + if (os == TICK_SCHED_REMOTE_OFFLINE) { + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); + } } #ifdef CONFIG_HOTPLUG_CPU static void sched_tick_stop(int cpu) { struct tick_work *twork; + int os; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) return; @@ -3564,7 +3603,10 @@ static void sched_tick_stop(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - cancel_delayed_work_sync(&twork->work); + /* There cannot be competing actions, but don't rely on stop-machine. */ + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); + WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); + /* Don't cancel, as this would mess up the state machine. */ } #endif /* CONFIG_HOTPLUG_CPU */ @@ -3572,7 +3614,6 @@ int __init sched_tick_offload_init(void) { tick_work_cpu = alloc_percpu(struct tick_work); BUG_ON(!tick_work_cpu); - return 0; } From 1f3ebc8253ee56bfaa883c5114fb5569c56f6197 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Jun 2019 14:05:52 -0700 Subject: [PATCH 03/28] rcu: Restore barrier() to rcu_read_lock() and rcu_read_unlock() Commit bb73c52bad36 ("rcu: Don't disable preemption for Tiny and Tree RCU readers") removed the barrier() calls from rcu_read_lock() and rcu_write_lock() in CONFIG_PREEMPT=n&&CONFIG_PREEMPT_COUNT=n kernels. Within RCU, this commit was OK, but it failed to account for things like get_user() that can pagefault and that can be reordered by the compiler. Lack of the barrier() calls in rcu_read_lock() and rcu_read_unlock() can cause these page faults to migrate into RCU read-side critical sections, which in CONFIG_PREEMPT=n kernels could result in too-short grace periods and arbitrary misbehavior. 
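As an illustration (an editorial sketch in kernel-style C, not code from this patch; the sketch_* names are hypothetical), a single compiler barrier in each read-side marker is what keeps a page-faulting access such as get_user() from being moved into the critical section when the markers would otherwise expand to no code at all:

    /*
     * Illustrative sketch only.  In a CONFIG_PREEMPT=n && CONFIG_PREEMPT_COUNT=n
     * build, read-side markers that generate no code give the compiler license
     * to reorder accesses across them; a barrier() at each end pins the
     * critical section in place.
     */
    #include <linux/compiler.h>	/* barrier() */

    static inline void sketch_rcu_read_lock(void)
    {
            barrier();	/* Compiler cannot move accesses across the start of the reader. */
    }

    static inline void sketch_rcu_read_unlock(void)
    {
            barrier();	/* Compiler cannot move accesses across the end of the reader. */
    }
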
Please see commit 386afc91144b ("spinlocks and preemption points need to be at least compiler barriers") and Linus's commit 66be4e66a7f4 ("rcu: locking and unlocking need to always be at least barriers"), this last of which restores the barrier() call to both rcu_read_lock() and rcu_read_unlock(). This commit removes the barrier() calls that are no longer needed, given their addition in Linus's commit noted above. The combination of this commit and Linus's commit effectively reverts commit bb73c52bad36 ("rcu: Don't disable preemption for Tiny and Tree RCU readers"). Reported-by: Herbert Xu Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney [ paulmck: Fix embarrassing typo located by Alan Stern. ] --- .../RCU/Design/Requirements/Requirements.html | 71 +++++++++++++++++++ kernel/rcu/tree_plugin.h | 11 --- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 5a9238a2883c..f04c467e55c5 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2129,6 +2129,8 @@ Some of the relevant points of interest are as follows:
  • Hotplug CPU.
  • Scheduler and RCU.
  • Tracing and RCU.
+ • Accesses to User Memory and RCU.
  • Energy Efficiency.
  • Scheduling-Clock Interrupts and RCU. @@ -2521,6 +2523,75 @@ cannot be used. The tracing folks both located the requirement and provided the needed fix, so this surprise requirement was relatively painless. +
    +Accesses to User Memory and RCU
    + +
    +The kernel needs to access user-space memory, for example, to access +data referenced by system-call parameters. +The get_user() macro does this job. + +
    +However, user-space memory might well be paged out, which means +that get_user() might well page-fault and thus block while +waiting for the resulting I/O to complete. +It would be a very bad thing for the compiler to reorder +a get_user() invocation into an RCU read-side critical +section. +For example, suppose that the source code looked like this: + +
    +
    + 1 rcu_read_lock();
    + 2 p = rcu_dereference(gp);
    + 3 v = p->value;
    + 4 rcu_read_unlock();
    + 5 get_user(user_v, user_p);
    + 6 do_something_with(v, user_v);
    +
    +
    + +
    +The compiler must not be permitted to transform this source code into +the following: + +
    +
    + 1 rcu_read_lock();
    + 2 p = rcu_dereference(gp);
    + 3 get_user(user_v, user_p); // BUG: POSSIBLE PAGE FAULT!!!
    + 4 v = p->value;
    + 5 rcu_read_unlock();
    + 6 do_something_with(v, user_v);
    +
    +
    + +
    +If the compiler did make this transformation in a +CONFIG_PREEMPT=n kernel build, and if get_user() did +page fault, the result would be a quiescent state in the middle +of an RCU read-side critical section. +This misplaced quiescent state could result in line 4 being +a use-after-free access, which could be bad for your kernel's +actuarial statistics. +Similar examples can be constructed with the call to get_user() +preceding the rcu_read_lock(). + +
    +Unfortunately, get_user() doesn't have any particular +ordering properties, and in some architectures the underlying asm +isn't even marked volatile. +And even if it was marked volatile, the above access to +p->value is not volatile, so the compiler would not have any +reason to keep those two accesses in order. + +
    +Therefore, the Linux-kernel definitions of rcu_read_lock() +and rcu_read_unlock() must act as compiler barriers, +at least for outermost instances of rcu_read_lock() and +rcu_read_unlock() within a nested set of RCU read-side critical +sections. +
    Energy Efficiency
    diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index acb225023ed1..3f1b5041de9b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; - barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); @@ -340,7 +339,6 @@ void rcu_note_context_switch(bool preempt) if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -828,11 +826,6 @@ static void rcu_qs(void) * dyntick-idle quiescent state visible to other CPUs, which will in * some cases serve for expedited as well as normal grace periods. * Either way, register a lightweight quiescent state. - * - * The barrier() calls are redundant in the common case when this is - * called externally, but just in case this is called from within this - * file. - * */ void rcu_all_qs(void) { @@ -847,14 +840,12 @@ void rcu_all_qs(void) return; } this_cpu_write(rcu_data.rcu_urgent_qs, false); - barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } rcu_qs(); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -864,7 +855,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs); */ void rcu_note_context_switch(bool preempt) { - barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_qs(); /* Load rcu_urgent_qs before other flags. */ @@ -877,7 +867,6 @@ void rcu_note_context_switch(bool preempt) rcu_tasks_qs(current); out: trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); From cdc694b2359d52cd6d0465d5a6263d97c786fb0c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Jun 2019 15:30:49 -0700 Subject: [PATCH 04/28] rcu: Add kernel parameter to dump trace after RCU CPU stall warning This commit adds a rcu_cpu_stall_ftrace_dump kernel boot parameter, that, when set, causes the trace buffer to be dumped after an RCU CPU stall warning is printed. This kernel boot parameter is disabled by default, maintaining compatibility with previous behavior. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/rcu/rcu.h | 1 + kernel/rcu/tree_stall.h | 4 ++++ kernel/rcu/update.c | 2 ++ 4 files changed, 11 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7ccd158b3894..f3fcd6140ee1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4047,6 +4047,10 @@ rcutorture.verbose= [KNL] Enable additional printk() statements. + rcupdate.rcu_cpu_stall_ftrace_dump= [KNL] + Dump ftrace buffer after reporting RCU CPU + stall warning. + rcupdate.rcu_cpu_stall_suppress= [KNL] Suppress RCU CPU stall warning messages. 
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5290b01de534..8fd4f82c9b3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -227,6 +227,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) #ifdef CONFIG_RCU_STALL_COMMON +extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 065183391f75..0627a66699a6 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -527,6 +527,8 @@ static void check_cpu_stall(struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(); + if (rcu_cpu_stall_ftrace_dump) + rcu_ftrace_dump(DUMP_ALL); } else if (rcu_gp_in_progress() && ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && @@ -534,6 +536,8 @@ static void check_cpu_stall(struct rcu_data *rdp) /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2); + if (rcu_cpu_stall_ftrace_dump) + rcu_ftrace_dump(DUMP_ALL); } } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 61df2bf08563..249517058b13 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -437,6 +437,8 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); #endif #ifdef CONFIG_RCU_STALL_COMMON +int rcu_cpu_stall_ftrace_dump __read_mostly; +module_param(rcu_cpu_stall_ftrace_dump, int, 0644); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); From fbad01af8c3bb9618848abde8054ab7e0c2330fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Jun 2019 15:42:51 -0700 Subject: [PATCH 05/28] rcu: Add destroy_work_on_stack() to match INIT_WORK_ONSTACK() The synchronize_rcu_expedited() function has an INIT_WORK_ONSTACK(), but lacks the corresponding destroy_work_on_stack(). This commit therefore adds destroy_work_on_stack(). Reported-by: Andrea Arcangeli Signed-off-by: Paul E. McKenney Acked-by: Andrea Arcangeli --- kernel/rcu/tree_exp.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index af7e7b9c86af..513b403b683b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -792,6 +792,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { + bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -817,7 +818,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + if (unlikely(boottime)) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -835,5 +836,8 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); + + if (likely(!boottime)) + destroy_work_on_stack(&rew.rew_work); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From 7e210a653ec9445512534cd235cac29e7301af2a Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 28 Jun 2019 17:11:10 -0700 Subject: [PATCH 06/28] srcu: Avoid srcutorture security-based pointer obfuscation Because pointer output is now obfuscated, and because what you really want to know is whether or not the callback lists are empty, this commit replaces the srcu_data structure's head callback pointer printout with a single character that is "." is the callback list is empty or "C" otherwise. This is the only remaining user of rcu_segcblist_head(), so this commit also removes this function's definition. It also turns out that rcu_segcblist_tail() no longer has any callers, so this commit removes that function's definition while in the area. They were both marked "Interim", and their end has come. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.h | 21 --------------------- kernel/rcu/srcutree.c | 5 +++-- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 71b64648464e..822a39da0533 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -76,27 +76,6 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) return !*rsclp->tails[seg]; } -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ - return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; -} - void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index cf0e886314f2..5dffade2d7cd 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1279,8 +1279,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) c0 = l0 - u0; c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %1p)", - cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); s0 += c0; s1 += c1; } From 3545832fc22e2316d9c289f6ba825710a268bfa6 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 1 Jul 2019 09:40:39 +0900 Subject: [PATCH 07/28] rcu: Change return type of rcu_spawn_one_boost_kthread() The return value of rcu_spawn_one_boost_kthread() is not used any longer. This commit therefore changes its return type from int to void, and removes the cast to void from its callers. Signed-off-by: Byungchul Park Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3f1b5041de9b..307ae6ebb804 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1123,7 +1123,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * already exist. We only create this kthread for preemptible RCU. * Returns zero if all is well, a negated errno otherwise. 
*/ -static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) +static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { int rnp_index = rnp - rcu_get_root(); unsigned long flags; @@ -1131,25 +1131,27 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct task_struct *t; if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) - return 0; + return; if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) - return 0; + return; rcu_state.boost = 1; + if (rnp->boost_kthread_task != NULL) - return 0; + return; + t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); + if (WARN_ON_ONCE(IS_ERR(t))) + return; + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - return 0; } /* @@ -1190,7 +1192,7 @@ static void __init rcu_spawn_boost_kthreads(void) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) - (void)rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_one_boost_kthread(rnp); } static void rcu_prepare_kthreads(int cpu) @@ -1200,7 +1202,7 @@ static void rcu_prepare_kthreads(int cpu) /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_one_boost_kthread(rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ From 0500873de968df6fdef5752d7bbdca317ddc220b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Jul 2019 08:01:50 -0700 Subject: [PATCH 08/28] doc: Add rcutree.kthread_prio pointer to stallwarn.txt This commit adds mention of the rcutree.kthread_prio kernel boot parameter to the discussion of how high-priority real-time tasks can result in RCU CPU stall warnings. (However, this does not necessarily help when the high-priority real-time tasks are using dubious deadlines.) Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 13e88fc00f01..f48f4621ccbc 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -57,6 +57,12 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that CONFIG_PREEMPT_RCU case, you might see stall-warning messages. + You can use the rcutree.kthread_prio kernel boot parameter to + increase the scheduling priority of RCU's kthreads, which can + help avoid this problem. However, please note that doing this + can increase your system's context-switch rate and thus degrade + performance. + o A periodic interrupt whose handler takes longer than the time interval between successive pairs of interrupts. This can prevent RCU's kthreads and softirq handlers from running. From 0a5b99f57873e233ad42ef71e23c629f6ea1fcfe Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 11 Jul 2019 16:45:41 -0400 Subject: [PATCH 09/28] treewide: Rename rcu_dereference_raw_notrace() to _check() The rcu_dereference_raw_notrace() API name is confusing. It is equivalent to rcu_dereference_raw() except that it also does sparse pointer checking. There are only a few users of rcu_dereference_raw_notrace(). This patches renames all of them to be rcu_dereference_raw_check() with the "_check()" indicating sparse checking. Signed-off-by: Joel Fernandes (Google) [ paulmck: Fix checkpatch warnings about parentheses. ] Signed-off-by: Paul E. 
McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 2 +- arch/powerpc/include/asm/kvm_book3s_64.h | 2 +- include/linux/rculist.h | 6 +++--- include/linux/rcupdate.h | 2 +- kernel/trace/ftrace_internal.h | 8 ++++---- kernel/trace/trace.c | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 5a9238a2883c..bdbc84f1b949 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2512,7 +2512,7 @@ disabled across the entire RCU read-side critical section.
    It is possible to use tracing on RCU code, but tracing itself uses RCU. -For this reason, rcu_dereference_raw_notrace() +For this reason, rcu_dereference_raw_check() is provided for use by tracing, which avoids the destructive recursion that could otherwise ensue. This API is also used by virtualization in some architectures, diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index bb7c8cc77f1a..04b2b927bb5a 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -535,7 +535,7 @@ static inline void note_hpte_modification(struct kvm *kvm, */ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) { - return rcu_dereference_raw_notrace(kvm->memslots[0]); + return rcu_dereference_raw_check(kvm->memslots[0]); } extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e91ec9ddcd30..932296144131 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -622,7 +622,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu(pos, head, member) \ - for (pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\ + for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ @@ -642,10 +642,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * not do any RCU debugging or tracing. */ #define hlist_for_each_entry_rcu_notrace(pos, head, member) \ - for (pos = hlist_entry_safe (rcu_dereference_raw_notrace(hlist_first_rcu(head)),\ + for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ - pos = hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\ + pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8f7167478c1d..bfcafbc1e301 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -476,7 +476,7 @@ do { \ * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ -#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) +#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 0515a2096f90..0456e0a3dab1 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -6,22 +6,22 @@ /* * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw_notrace() is that elements removed from this list + * can use rcu_dereference_raw_check() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle + * mechanism. The rcu_dereference_raw_check() calls are needed to handle * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! 
*/ #define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw_notrace(list); \ + op = rcu_dereference_raw_check(list); \ do /* * Optimized for just a single item in the list (as that is the normal case). */ #define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ + while (likely(op = rcu_dereference_raw_check((op)->next)) && \ unlikely((op) != &ftrace_list_end)) extern struct ftrace_ops __rcu *ftrace_ops_list; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 525a97fbbc60..642474b26ba7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2642,10 +2642,10 @@ static void ftrace_exports(struct ring_buffer_event *event) preempt_disable_notrace(); - export = rcu_dereference_raw_notrace(ftrace_exports_list); + export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event); - export = rcu_dereference_raw_notrace(export->next); + export = rcu_dereference_raw_check(export->next); } preempt_enable_notrace(); From 9147089bee3a6b504821dd8462e2be229e6dbfae Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:21 -0400 Subject: [PATCH 10/28] rcu: Remove redundant debug_locks check in rcu_read_lock_sched_held() The debug_locks flag can never be true at the end of rcu_read_lock_sched_held() because it is already checked by the earlier call todebug_lockdep_rcu_enabled(). This commit therefore removes this redundant check. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 61df2bf08563..9dd5aeef6e70 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -93,17 +93,13 @@ module_param(rcu_normal_after_boot, int, 0); */ int rcu_read_lock_sched_held(void) { - int lockdep_opinion = 0; - if (!debug_lockdep_rcu_enabled()) return 1; if (!rcu_is_watching()) return 0; if (!rcu_lockdep_current_cpu_online()) return 0; - if (debug_locks) - lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || !preemptible(); + return lock_is_held(&rcu_sched_lock_map) || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif From b3f3886c59f649ace424d132bd8c06e3611c71a8 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Fri, 31 May 2019 23:15:45 +0800 Subject: [PATCH 11/28] rcuperf: Fix perf_type module-parameter description The rcu_bh rcuperf type was removed by commit 620d246065cd("rcuperf: Remove the "rcu_bh" and "sched" torture types"), but it lives on in the MODULE_PARM_DESC() of perf_type. This commit therefore changes that module-parameter description to substitute srcu for rcu_bh. Signed-off-by: Xiao Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7a6890b23c5f..4513807cd4c4 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); static int nrealreaders; static int nrealwriters; From 2c667e5eae232f7f4a4fc30f58e51abdb0dc43c5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 21 Jun 2019 10:32:57 -0700 Subject: [PATCH 12/28] torture: Expand last_ts variable in kvm-test-1-run.sh The kvm-test-1-run.sh script says 'test -z "last_ts"' which always evaluates to true (AKA zero) regardless of the value of the last_ts shell variable. This commit therefore inserts the needed dollar sign ("$"). Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 27b7b5693ede..33c669619736 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -227,7 +227,7 @@ then must_continue=yes fi last_ts="`tail $resdir/console.log | grep '^\[ *[0-9]\+\.[0-9]\+]' | tail -1 | sed -e 's/^\[ *//' -e 's/\..*$//'`" - if test -z "last_ts" + if test -z "$last_ts" then last_ts=0 fi From f4e8352928587ef8772df3d269a328efa609daaa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jun 2019 14:05:54 -0700 Subject: [PATCH 13/28] rcutorture: Test TREE03 with the threadirqs kernel boot parameter Since commit 05f415715ce45 ("rcu: Speed up expedited GPs when interrupting RCU reader") in v5.0 and through v5.1, booting with the threadirqs kernel boot parameter caused self-deadlocks, which can be reproduced using the following command on an 8-CPU system: tools/testing/selftests/rcutorture/bin/kvm.sh --duration 5 --configs "TREE03" --bootargs "threadirqs" This commit therefore adds the threadirqs kernel boot parameter to the TREE03 rcutorture scenario in order to more quickly detect future similar bugs. Link: http://lkml.kernel.org/r/20190626135447.y24mvfuid5fifwjc@linutronix.de Signed-off-by: Paul E. McKenney Cc: Sebastian Andrzej Siewior Cc: Joel Fernandes --- tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 5c3213cc3ad7..1c218944b1e9 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -3,3 +3,4 @@ rcutree.gp_preinit_delay=12 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcutree.kthread_prio=2 +threadirqs From bd1bfc51a36f334270b886db6d8467e55fe294ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Jun 2019 14:35:59 -0700 Subject: [PATCH 14/28] rcutorture: Emulate userspace sojourn during call_rcu() floods During an actual call_rcu() flood, there would be frequent trips to userspace (in-kernel call_rcu() floods must be otherwise housebroken). Userspace execution allows a great many things to interrupt execution, and rcutorture needs to also allow such interruptions. This commit therefore causes call_rcu() floods to occasionally invoke schedule(), thus preventing spurious rcutorture failures due to other parts of the kernel becoming irate at the call_rcu() flood events. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fce4e7e6f502..c44e5307afcc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1713,12 +1713,14 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) } // Give the scheduler a chance, even on nohz_full CPUs. 
-static void rcu_torture_fwd_prog_cond_resched(void) +static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) { if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { - if (need_resched()) + // Real call_rcu() floods hit userspace, so emulate that. + if (need_resched() || (iter & 0xfff)) schedule(); } else { + // No userspace emulation: CB invocation throttles call_rcu() cond_resched(); } } @@ -1746,7 +1748,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; - rcu_torture_fwd_prog_cond_resched(); + rcu_torture_fwd_prog_cond_resched(freed); } return freed; } @@ -1790,7 +1792,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - rcu_torture_fwd_prog_cond_resched(); + rcu_torture_fwd_prog_cond_resched(1); } (*tested_tries)++; if (!time_before(jiffies, stopat) && @@ -1875,7 +1877,7 @@ static void rcu_torture_fwd_prog_cr(void) rfcp->rfc_gps = 0; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - rcu_torture_fwd_prog_cond_resched(); + rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); From 21f57546ceaf4c5537a617f55b809a843b109210 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Thu, 4 Jul 2019 15:57:19 +0300 Subject: [PATCH 15/28] torture: Remove exporting of internal functions The functions torture_onoff_cleanup() and torture_shuffle_cleanup() are declared static and marked EXPORT_SYMBOL_GPL(), which is at best an odd combination. Because these functions are not used outside of the kernel/torture.c file they are defined in, this commit removes their EXPORT_SYMBOL_GPL() marking. Fixes: cc47ae083026 ("rcutorture: Abstract torture-test cleanup") Signed-off-by: Denis Efremov Signed-off-by: Paul E. McKenney --- kernel/torture.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index a8d9bdfba7c3..7c13f5558b71 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -263,7 +263,6 @@ static void torture_onoff_cleanup(void) onoff_task = NULL; #endif /* #ifdef CONFIG_HOTPLUG_CPU */ } -EXPORT_SYMBOL_GPL(torture_onoff_cleanup); /* * Print online/offline testing statistics. @@ -449,7 +448,6 @@ static void torture_shuffle_cleanup(void) } shuffler_task = NULL; } -EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); /* * Variables for auto-shutdown. This allows "lights out" torture runs From 77e9752ce69f36f1be4e366373727fb7921f5909 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 4 Jul 2019 00:34:30 -0400 Subject: [PATCH 16/28] rcuperf: Make rcuperf kernel test more robust for !expedited mode It is possible that the rcuperf kernel test runs concurrently with init starting up. During this time, the system is running all grace periods as expedited. However, rcuperf can also be run for normal GP tests. Right now, it depends on a holdoff time before starting the test to ensure grace periods start later. This works fine with the default holdoff time however it is not robust in situations where init takes greater than the holdoff time to finish running. Or, as in my case: I modified the rcuperf test locally to also run a thread that did preempt disable/enable in a loop. This had the effect of slowing down init. The end result was that the "batches:" counter in rcuperf was 0 causing a division by 0 error in the results. 
This counter was 0 because only expedited GPs seem to happen, not normal ones which led to the rcu_state.gp_seq counter remaining constant across grace periods which unexpectedly happen to be expedited. The system was running expedited RCU all the time because rcu_unexpedited_gp() would not have run yet from init. In other words, the test would concurrently with init booting in expedited GP mode. To fix this properly, this commit waits until system_state is set to SYSTEM_RUNNING before starting the test. This change is made just before kernel_init() invokes rcu_end_inkernel_boot(), and this latter is what turns off boot-time expediting of RCU grace periods. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 4513807cd4c4..5a879d073c1c 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -375,6 +375,14 @@ rcu_perf_writer(void *arg) if (holdoff) schedule_timeout_uninterruptible(holdoff * HZ); + /* + * Wait until rcu_end_inkernel_boot() is called for normal GP tests + * so that RCU is not always expedited for normal GP tests. + * The system_state test is approximate, but works well in practice. + */ + while (!gp_exp && system_state != SYSTEM_RUNNING) + schedule_timeout_uninterruptible(1); + t = ktime_get_mono_fast_ns(); if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; From 60013d5d2b4031e6027005e5e2dcb6ee6da6b186 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jul 2019 08:30:00 -0700 Subject: [PATCH 17/28] rcutorture: Aggressive forward-progress tests shouldn't block shutdown The more aggressive forward-progress tests can interfere with rcutorture shutdown, resulting in false-positive diagnostics. This commit therefore ends any such tests 30 seconds prior to shutdown. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c44e5307afcc..b22947324423 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -161,6 +161,7 @@ static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? */ static struct list_head rcu_torture_removed; +static unsigned long shutdown_jiffies; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -228,6 +229,15 @@ static u64 notrace rcu_trace_clock_local(void) } #endif /* #else #ifdef CONFIG_RCU_TRACE */ +/* + * Stop aggressive CPU-hog tests a bit before the end of the test in order + * to avoid interfering with test shutdown. + */ +static bool shutdown_time_arrived(void) +{ + return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ); +} + static unsigned long boost_starttime; /* jiffies of next boost test start. */ static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. 
*/ @@ -1787,6 +1797,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) WRITE_ONCE(rcu_fwd_startat, jiffies); stopat = rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { idx = cur_ops->readlock(); udelay(10); @@ -1796,6 +1807,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } (*tested_tries)++; if (!time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { (*tested)++; cver = READ_ONCE(rcu_torture_current_version) - cver; @@ -1854,6 +1866,7 @@ static void rcu_torture_fwd_prog_cr(void) gps = cur_ops->get_gp_seq(); rcu_launder_gp_seq_start = gps; while (time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { rfcp = READ_ONCE(rcu_fwd_cb_head); rfcpn = NULL; @@ -1886,7 +1899,8 @@ static void rcu_torture_fwd_prog_cr(void) cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(); - if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { + if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && + !shutdown_time_arrived()) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, @@ -2467,6 +2481,7 @@ rcu_torture_init(void) goto unwind; rcutor_hp = firsterr; } + shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; From 28875945ba98d1b47a8a706812b6494d165bb0a0 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:22 -0400 Subject: [PATCH 18/28] rcu: Add support for consolidated-RCU reader checking This commit adds RCU-reader checks to list_for_each_entry_rcu() and hlist_for_each_entry_rcu(). These checks are optional, and are indicated by a lockdep expression passed to a new optional argument to these two macros. If this optional lockdep expression is omitted, these two macros act as before, checking for an RCU read-side critical section. Signed-off-by: Joel Fernandes (Google) [ paulmck: Update to eliminate return within macro and update comment. ] Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 32 +++++++++++--- include/linux/rcupdate.h | 7 +++ kernel/rcu/Kconfig.debug | 11 +++++ kernel/rcu/update.c | 96 ++++++++++++++++++++++++++-------------- 4 files changed, 108 insertions(+), 38 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 932296144131..4158b7212936 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -40,6 +40,24 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) +/* + * Check during list traversal that we are within an RCU reader + */ + +#define check_arg_count_one(dummy) + +#ifdef CONFIG_PROVE_RCU_LIST +#define __list_check_rcu(dummy, cond, extra...) \ + ({ \ + check_arg_count_one(extra); \ + RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(), \ + "RCU-list traversed in non-reader section!"); \ + }) +#else +#define __list_check_rcu(dummy, cond, extra...) \ + ({ check_arg_count_one(extra); }) +#endif + /* * Insert a new entry between two known consecutive entries. 
* @@ -343,14 +361,16 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define list_for_each_entry_rcu(pos, head, member) \ - for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \ - &pos->member != (head); \ +#define list_for_each_entry_rcu(pos, head, member, cond...) \ + for (__list_check_rcu(dummy, ## cond, 0), \ + pos = list_entry_rcu((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** @@ -616,13 +636,15 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define hlist_for_each_entry_rcu(pos, head, member) \ - for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ +#define hlist_for_each_entry_rcu(pos, head, member, cond...) \ + for (__list_check_rcu(dummy, ## cond, 0), \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index bfcafbc1e301..80d6056f5855 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -221,6 +221,7 @@ int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); +int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -241,6 +242,12 @@ static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } + +static inline int rcu_read_lock_any_held(void) +{ + return !preemptible(); +} + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 5ec3ea4028e2..4aa02eee8f6c 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -8,6 +8,17 @@ menu "RCU Debugging" config PROVE_RCU def_bool PROVE_LOCKING +config PROVE_RCU_LIST + bool "RCU list lockdep debugging" + depends on PROVE_RCU && RCU_EXPERT + default n + help + Enable RCU lockdep checking for list usages. By default it is + turned off since there are several list RCU users that still + need to be converted to pass a lockdep expression. To prevent + false-positive splats, we keep it default disabled but once all + users are converted, we can remove this config option. + config TORTURE_TEST tristate default n diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 9dd5aeef6e70..38cbd616b381 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -61,9 +61,15 @@ module_param(rcu_normal_after_boot, int, 0); #ifdef CONFIG_DEBUG_LOCK_ALLOC /** - * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? 
+ * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? + * @ret: Best guess answer if lockdep cannot be relied on * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * Returns true if lockdep must be ignored, in which case *ret contains + * the best guess described below. Otherwise returns false, in which + * case *ret tells the caller nothing and the caller should instead + * consult lockdep. + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -75,30 +81,44 @@ module_param(rcu_normal_after_boot, int, 0); * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. * - * Note that if the CPU is in the idle loop from an RCU point of - * view (ie: that we are in the section between rcu_idle_enter() and - * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU - * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs - * that are in such a section, considering these as in extended quiescent - * state, so such a CPU is effectively never in an RCU read-side critical - * section regardless of what RCU primitives it invokes. This state of - * affairs is required --- we need to keep an RCU-free window in idle - * where the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run in - * the idle task. + * Note that if the CPU is in the idle loop from an RCU point of view (ie: + * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) + * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are + * in such a section, considering these as in extended quiescent state, + * so such a CPU is effectively never in an RCU read-side critical section + * regardless of what RCU primitives it invokes. This state of affairs is + * required --- we need to keep an RCU-free window in idle where the CPU may + * possibly enter into low power mode. This way we can notice an extended + * quiescent state to other CPUs that started a grace period. Otherwise + * we would delay any grace period as long as we run in the idle task. * - * Similarly, we avoid claiming an SRCU read lock held if the current + * Similarly, we avoid claiming an RCU read lock held if the current * CPU is offline. 
*/ +static bool rcu_read_lock_held_common(bool *ret) +{ + if (!debug_lockdep_rcu_enabled()) { + *ret = 1; + return true; + } + if (!rcu_is_watching()) { + *ret = 0; + return true; + } + if (!rcu_lockdep_current_cpu_online()) { + *ret = 0; + return true; + } + return false; +} + int rcu_read_lock_sched_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return lock_is_held(&rcu_sched_lock_map) || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); @@ -257,12 +277,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); */ int rcu_read_lock_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return lock_is_held(&rcu_lock_map); } EXPORT_SYMBOL_GPL(rcu_read_lock_held); @@ -284,16 +302,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); */ int rcu_read_lock_bh_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); +int rcu_read_lock_any_held(void) +{ + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; + if (lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_sched_lock_map)) + return 1; + return !preemptible(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_any_held); + #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** From fbab8d6735e2643365040bd9e1057addc0d9b4cf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:23 -0400 Subject: [PATCH 19/28] rcu/sync: Remove custom check for RCU readers The rcu/sync code currently does a special check for being in an RCU read-side critical section. With RCU consolidating flavors and the generic helper added earlier in this series, this check is no longer need. This commit switches to the generic helper, saving a couple of lines of code. Cc: Oleg Nesterov Acked-by: Oleg Nesterov Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rcu_sync.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 9b83865d24f9..0027d4c8087c 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -31,9 +31,7 @@ struct rcu_sync { */ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) { - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && - !rcu_read_lock_bh_held() && - !rcu_read_lock_sched_held(), + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), "suspicious rcu_sync_is_idle() usage"); return !READ_ONCE(rsp->gp_state); /* GP_IDLE */ } From 7fd69b0ba48a2b2d8e5b4f0945b28d3839a7705a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:24 -0400 Subject: [PATCH 20/28] ipv4: Add lockdep condition to fix for_each_entry() This commit applies the consolidated list_for_each_entry_rcu() support for lockdep conditions. Acked-by: David S. Miller Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- net/ipv4/fib_frontend.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e8bc939b56dd..dde77f72e03e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -124,7 +124,8 @@ struct fib_table *fib_get_table(struct net *net, u32 id) h = id & (FIB_TABLE_HASHSZ - 1); head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist, + lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } From e78a7614f3876ac649b3df608789cb6ef74d0480 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Jun 2019 07:46:43 -0700 Subject: [PATCH 21/28] idle: Prevent late-arriving interrupts from disrupting offline Scheduling-clock interrupts can arrive late in the CPU-offline process, after idle entry and the subsequent call to cpuhp_report_idle_dead(). Once execution passes the call to rcu_report_dead(), RCU is ignoring the CPU, which results in lockdep complaints when the interrupt handler uses RCU: ------------------------------------------------------------------------ ============================= WARNING: suspicious RCU usage 5.2.0-rc1+ #681 Not tainted ----------------------------- kernel/sched/fair.c:9542 suspicious rcu_dereference_check() usage! other info that might help us debug this: RCU used illegally from offline CPU! rcu_scheduler_active = 2, debug_locks = 1 no locks held by swapper/5/0. stack backtrace: CPU: 5 PID: 0 Comm: swapper/5 Not tainted 5.2.0-rc1+ #681 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x5e/0x8b trigger_load_balance+0xa8/0x390 ? tick_sched_do_timer+0x60/0x60 update_process_times+0x3b/0x50 tick_sched_handle+0x2f/0x40 tick_sched_timer+0x32/0x70 __hrtimer_run_queues+0xd3/0x3b0 hrtimer_interrupt+0x11d/0x270 ? sched_clock_local+0xc/0x74 smp_apic_timer_interrupt+0x79/0x200 apic_timer_interrupt+0xf/0x20 RIP: 0010:delay_tsc+0x22/0x50 Code: ff 0f 1f 80 00 00 00 00 65 44 8b 05 18 a7 11 48 0f ae e8 0f 31 48 89 d6 48 c1 e6 20 48 09 c6 eb 0e f3 90 65 8b 05 fe a6 11 48 <41> 39 c0 75 18 0f ae e8 0f 31 48 c1 e2 20 48 09 c2 48 89 d0 48 29 RSP: 0000:ffff8f92c0157ed0 EFLAGS: 00000212 ORIG_RAX: ffffffffffffff13 RAX: 0000000000000005 RBX: ffff8c861f356400 RCX: ffff8f92c0157e64 RDX: 000000321214c8cc RSI: 00000032120daa7f RDI: 0000000000260f15 RBP: 0000000000000005 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: ffff8c861ee18000 R15: ffff8c861ee18000 cpuhp_report_idle_dead+0x31/0x60 do_idle+0x1d5/0x200 ? _raw_spin_unlock_irqrestore+0x2d/0x40 cpu_startup_entry+0x14/0x20 start_secondary+0x151/0x170 secondary_startup_64+0xa4/0xb0 ------------------------------------------------------------------------ This happens rarely, but can be forced to happen more often by placing delays in cpuhp_report_idle_dead() following the call to rcu_report_dead(). With this in place, the following rcutorture scenario reproduces the problem within a few minutes: tools/testing/selftests/rcutorture/bin/kvm.sh --cpus 8 --duration 5 --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" --configs "TREE04" This commit uses the crude but effective expedient of moving the disabling of interrupts within the idle loop to precede the cpu_is_offline() check. It also invokes tick_nohz_idle_stop_tick() instead of tick_nohz_idle_stop_tick_protected() to shut off the scheduling-clock interrupt.
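For clarity, here is a minimal sketch of the reordered sequence in do_idle() described above; the function names are taken from the diff that follows, and everything else in the idle loop is omitted:

	/*
	 * Disable interrupts before the offline check so that a late
	 * scheduling-clock interrupt cannot be taken in the window after
	 * rcu_report_dead() has told RCU to ignore this CPU.
	 */
	local_irq_disable();

	if (cpu_is_offline(cpu)) {
		/* Interrupts are already off, so the _protected() variant is unneeded. */
		tick_nohz_idle_stop_tick();
		cpuhp_report_idle_dead();
		arch_cpu_idle_dead();
	}

	arch_cpu_idle_enter();

This sketch only restates the new ordering; the diff below is the authoritative change.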
Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar [ paulmck: Revert tick_nohz_idle_stop_tick_protected() removal, new callers. ] Signed-off-by: Paul E. McKenney --- kernel/sched/idle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..e4bc4aa739b8 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -241,13 +241,14 @@ static void do_idle(void) check_pgt_cache(); rmb(); + local_irq_disable(); + if (cpu_is_offline(cpu)) { - tick_nohz_idle_stop_tick_protected(); + tick_nohz_idle_stop_tick(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } - local_irq_disable(); arch_cpu_idle_enter(); /* From b823cafa7501f946a37dce5aa1e576a0b2f31ed9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Jul 2019 08:05:10 -0700 Subject: [PATCH 22/28] rcu: Remove redundant "if" condition from rcu_gp_is_expedited() Because rcu_expedited_nesting is initialized to 1 and not decremented until just before init is spawned, rcu_expedited_nesting is guaranteed to be non-zero whenever rcu_scheduler_active == RCU_SCHEDULER_INIT. This commit therefore removes this redundant "if" equality test. Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/update.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 249517058b13..64e9cc8609e7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -136,8 +136,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); */ bool rcu_gp_is_expedited(void) { - return rcu_expedited || atomic_read(&rcu_expedited_nesting) || - rcu_scheduler_active == RCU_SCHEDULER_INIT; + return rcu_expedited || atomic_read(&rcu_expedited_nesting); } EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); From 1d5087ab964d84e5a0cfe5059cf5e929127d573f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 May 2015 14:50:06 -0700 Subject: [PATCH 23/28] arm: Use common outgoing-CPU-notification code This commit replaces the open-coded CPU-offline notification with new common code. In particular, this change avoids calling scheduler code using RCU from an offline CPU that RCU is ignoring. This is a minimal change. A more intrusive change might invoke the cpu_check_up_prepare() and cpu_set_state_online() functions at CPU-online time, which would allow onlining to throw an error if the CPU did not go offline properly. Signed-off-by: Paul E. McKenney Cc: linux-arm-kernel@lists.infradead.org Cc: Russell King Cc: Mark Rutland Cc: Dietmar Eggemann --- arch/arm/kernel/smp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index aab8ba40ce38..4b0bab2607e4 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -264,15 +264,13 @@ int __cpu_disable(void) return 0; } -static DECLARE_COMPLETION(cpu_died); - /* * called on the thread which is asking for a CPU to be shutdown - * waits until shutdown has completed, or it is timed out. */ void __cpu_die(unsigned int cpu) { - if (!wait_for_completion_timeout(&cpu_died, msecs_to_jiffies(5000))) { + if (!cpu_wait_death(cpu, 5)) { pr_err("CPU%u: cpu didn't die\n", cpu); return; } @@ -319,7 +317,7 @@ void arch_cpu_idle_dead(void) * this returns, power and/or clocks can be removed at any point * from this CPU and its cache by platform_cpu_kill().
*/ - complete(&cpu_died); + (void)cpu_report_death(); /* * Ensure that the cache lines associated with that completion are From 511b44f7598ce602f9efce687ca9eec013967d9b Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Mon, 29 Jul 2019 13:25:57 +0530 Subject: [PATCH 24/28] rcu: Fix spelling mistake "greate"->"great" This commit fixes a spelling mistake in file tree_exp.h. Signed-off-by: Mukesh Ojha Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 513b403b683b..d632cd019597 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -781,7 +781,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * other hand, if the CPU is not in an RCU read-side critical section, * the IPI handler reports the quiescent state immediately. * - * Although this is a greate improvement over previous expedited + * Although this is a great improvement over previous expedited * implementations, it is still unfriendly to real-time workloads, so is * thus not recommended for any sort of common-case code. In fact, if * you are using synchronize_rcu_expedited() in a loop, please restructure From ba31ebfa7b749906e0befcc1e0c0db5e7463d55e Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 5 Aug 2019 14:15:17 +0200 Subject: [PATCH 25/28] MAINTAINERS: Update e-mail address for Andrea Parri My @amarulasolutions.com address stopped working this July, so update to my @gmail.com address where you'll still be able to reach me. Signed-off-by: Andrea Parri Cc: Alan Stern Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: Nicholas Piggin Cc: David Howells Cc: Jade Alglave Cc: Luc Maranget Cc: "Paul E. McKenney" Cc: Akira Yokosawa Cc: Daniel Lustig Signed-off-by: Paul E. McKenney --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6426db5198f0..527317026492 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9326,7 +9326,7 @@ F: drivers/misc/lkdtm/* LINUX KERNEL MEMORY CONSISTENCY MODEL (LKMM) M: Alan Stern -M: Andrea Parri +M: Andrea Parri M: Will Deacon M: Peter Zijlstra M: Boqun Feng From c2fa1e1bfa5b74558854a70b8afd797d43eb2743 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:25 -0400 Subject: [PATCH 26/28] driver/core: Convert to use built-in RCU list checking This commit applies the consolidated list_for_each_entry_rcu() support for lockdep conditions. Acked-by: Greg Kroah-Hartman Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E.
McKenney --- drivers/base/base.h | 1 + drivers/base/core.c | 12 ++++++++++++ drivers/base/power/runtime.c | 15 ++++++++++----- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/base/base.h b/drivers/base/base.h index b405436ee28e..0d32544b6f91 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -165,6 +165,7 @@ static inline int devtmpfs_init(void) { return 0; } /* Device links support */ extern int device_links_read_lock(void); extern void device_links_read_unlock(int idx); +extern int device_links_read_lock_held(void); extern int device_links_check_suppliers(struct device *dev); extern void device_links_driver_bound(struct device *dev); extern void device_links_driver_cleanup(struct device *dev); diff --git a/drivers/base/core.c b/drivers/base/core.c index 636058bbf48a..eede79630ceb 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -68,6 +68,11 @@ void device_links_read_unlock(int idx) { srcu_read_unlock(&device_links_srcu, idx); } + +int device_links_read_lock_held(void) +{ + return srcu_read_lock_held(&device_links_srcu); +} #else /* !CONFIG_SRCU */ static DECLARE_RWSEM(device_links_lock); @@ -91,6 +96,13 @@ void device_links_read_unlock(int not_used) { up_read(&device_links_lock); } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int device_links_read_lock_held(void) +{ + return lockdep_is_held(&device_links_lock); +} +#endif #endif /* !CONFIG_SRCU */ /** diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index b75335508d2c..50def99df970 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -287,7 +287,8 @@ static int rpm_get_suppliers(struct device *dev) { struct device_link *link; - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) { + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) { int retval; if (!(link->flags & DL_FLAG_PM_RUNTIME) || @@ -309,7 +310,8 @@ static void rpm_put_suppliers(struct device *dev) { struct device_link *link; - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) { + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) { if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND) continue; @@ -1640,7 +1642,8 @@ void pm_runtime_clean_up_links(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu(link, &dev->links.consumers, s_node) { + list_for_each_entry_rcu(link, &dev->links.consumers, s_node, + device_links_read_lock_held()) { if (link->flags & DL_FLAG_STATELESS) continue; @@ -1662,7 +1665,8 @@ void pm_runtime_get_suppliers(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) if (link->flags & DL_FLAG_PM_RUNTIME) { link->supplier_preactivated = true; refcount_inc(&link->rpm_active); @@ -1683,7 +1687,8 @@ void pm_runtime_put_suppliers(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) if (link->supplier_preactivated) { link->supplier_preactivated = false; if (refcount_dec_not_one(&link->rpm_active)) From 842a56cf3eb00f717f9522766c0e7b71bafd5fc1 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:27 -0400 Subject: [PATCH 27/28] x86/pci: Pass lockdep condition to pci_mmcfg_list iterator The pci_mmcfg_list is
traversed by list_for_each_entry_rcu() outside of an RCU read-side critical section, which is safe because the pci_mmcfg_lock is held. This commit therefore adds a lockdep expression to list_for_each_entry_rcu() in order to avoid lockdep warnings. Acked-by: Bjorn Helgaas Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- arch/x86/pci/mmconfig-shared.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 7389db538c30..6fa42e9c4e6f 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -29,6 +29,7 @@ static bool pci_mmcfg_running_state; static bool pci_mmcfg_arch_init_failed; static DEFINE_MUTEX(pci_mmcfg_lock); +#define pci_mmcfg_lock_held() lock_is_held(&(pci_mmcfg_lock).dep_map) LIST_HEAD(pci_mmcfg_list); @@ -54,7 +55,7 @@ static void list_add_sorted(struct pci_mmcfg_region *new) struct pci_mmcfg_region *cfg; /* keep list sorted by segment and starting bus number */ - list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) { + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) { if (cfg->segment > new->segment || (cfg->segment == new->segment && cfg->start_bus >= new->start_bus)) { @@ -118,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) { struct pci_mmcfg_region *cfg; - list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) if (cfg->segment == segment && cfg->start_bus <= bus && bus <= cfg->end_bus) return cfg; From bee6f87166e9c6b8d81a7570995bd637e8da485a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:28 -0400 Subject: [PATCH 28/28] acpi: Use built-in RCU list checking for acpi_ioremaps list This commit applies the consolidated list_for_each_entry_rcu() support for lockdep conditions. Acked-by: Rafael J. Wysocki Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- drivers/acpi/osl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 9c0edf2fc0dd..2f9d0d20b836 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,7 @@ struct acpi_ioremap { static LIST_HEAD(acpi_ioremaps); static DEFINE_MUTEX(acpi_ioremap_lock); +#define acpi_ioremap_lock_held() lock_is_held(&acpi_ioremap_lock.dep_map) static void __init acpi_request_region (struct acpi_generic_address *gas, unsigned int length, char *desc) @@ -206,7 +208,7 @@ acpi_map_lookup(acpi_physical_address phys, acpi_size size) { struct acpi_ioremap *map; - list_for_each_entry_rcu(map, &acpi_ioremaps, list) + list_for_each_entry_rcu(map, &acpi_ioremaps, list, acpi_ioremap_lock_held()) if (map->phys <= phys && phys + size <= map->phys + map->size) return map; @@ -249,7 +251,7 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size) { struct acpi_ioremap *map; - list_for_each_entry_rcu(map, &acpi_ioremaps, list) + list_for_each_entry_rcu(map, &acpi_ioremaps, list, acpi_ioremap_lock_held()) if (map->virt <= virt && virt + size <= map->virt + map->size) return map;