diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index b0b814d75ca1..60740e8ecb37 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -836,7 +836,6 @@ Provides counts of softirq handlers serviced since boot time, for each cpu. TASKLET: 0 0 0 290 SCHED: 27035 26983 26971 26746 HRTIMER: 0 0 0 0 - RCU: 1678 1769 2178 2250 1.3 IDE devices in /proc/ide diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index bea0ac750712..6c12989839d9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -414,7 +414,6 @@ enum TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, - RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS }; diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 1c09820df585..ae045ca7d356 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -20,8 +20,7 @@ struct softirq_action; softirq_name(BLOCK_IOPOLL), \ softirq_name(TASKLET), \ softirq_name(SCHED), \ - softirq_name(HRTIMER), \ - softirq_name(RCU)) + softirq_name(HRTIMER)) /** * irq_handler_entry - called immediately before the irq action handler diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0ac1cc03f935..18e33313873e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -47,6 +47,8 @@ #include #include #include +#include +#include #include "rcutree.h" @@ -82,6 +84,20 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); +/* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq); +static DEFINE_PER_CPU(char, rcu_cpu_has_work); +static char rcu_kthreads_spawnable; + +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp); +static void invoke_rcu_kthread(void); + +#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ + /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -1009,6 +1025,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) /* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. + * There can only be one CPU hotplug operation at a time, so no other + * CPU can be attempting to update rcu_cpu_kthread_task. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { @@ -1017,6 +1035,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; + struct task_struct *t; + + /* Stop the CPU's kthread. */ + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t != NULL) { + per_cpu(rcu_cpu_kthread_task, cpu) = NULL; + kthread_stop(t); + } /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -1054,6 +1080,19 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); + + /* + * If there are no more online CPUs for this rcu_node structure, + * kill the rcu_node structure's kthread. Otherwise, adjust its + * affinity. + */ + t = rnp->node_kthread_task; + if (t != NULL && + rnp->qsmaskinit == 0) { + kthread_stop(t); + rnp->node_kthread_task = NULL; + } else + rcu_node_kthread_setaffinity(rnp); } /* @@ -1151,7 +1190,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Re-raise the RCU softirq if there are callbacks remaining. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } /* @@ -1197,7 +1236,7 @@ void rcu_check_callbacks(int cpu, int user) } rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } #ifdef CONFIG_SMP @@ -1361,7 +1400,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) /* * Do softirq processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static void rcu_process_callbacks(void) { __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1372,6 +1411,281 @@ static void rcu_process_callbacks(struct softirq_action *unused) rcu_needs_cpu_flush(); } +/* + * Wake up the current CPU's kthread. This replaces raise_softirq() + * in earlier versions of RCU. Note that because we are running on + * the current CPU with interrupts disabled, the rcu_cpu_kthread_task + * cannot disappear out from under us. + */ +static void invoke_rcu_kthread(void) +{ + unsigned long flags; + wait_queue_head_t *q; + int cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + per_cpu(rcu_cpu_has_work, cpu) = 1; + if (per_cpu(rcu_cpu_kthread_task, cpu) == NULL) { + local_irq_restore(flags); + return; + } + q = &per_cpu(rcu_cpu_wq, cpu); + wake_up(q); + local_irq_restore(flags); +} + +/* + * Timer handler to initiate the waking up of per-CPU kthreads that + * have yielded the CPU due to excess numbers of RCU callbacks. + */ +static void rcu_cpu_kthread_timer(unsigned long arg) +{ + unsigned long flags; + struct rcu_data *rdp = (struct rcu_data *)arg; + struct rcu_node *rnp = rdp->mynode; + struct task_struct *t; + + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->wakemask |= rdp->grpmask; + t = rnp->node_kthread_task; + if (t == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + wake_up_process(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Drop to non-real-time priority and yield, but only after posting a + * timer that will cause us to regain our real-time priority if we + * remain preempted. Either way, we restore our real-time priority + * before returning. + */ +static void rcu_yield(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu); + struct sched_param sp; + struct timer_list yield_timer; + + setup_timer_on_stack(&yield_timer, rcu_cpu_kthread_timer, (unsigned long)rdp); + mod_timer(&yield_timer, jiffies + 2); + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); + schedule(); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + del_timer(&yield_timer); +} + +/* + * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. + * This can happen while the corresponding CPU is either coming online + * or going offline. We cannot wait until the CPU is fully online + * before starting the kthread, because the various notifier functions + * can wait for RCU grace periods. So we park rcu_cpu_kthread() until + * the corresponding CPU is online. + * + * Return 1 if the kthread needs to stop, 0 otherwise. + * + * Caller must disable bh. This function can momentarily enable it. + */ +static int rcu_cpu_kthread_should_stop(int cpu) +{ + while (cpu_is_offline(cpu) || + !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || + smp_processor_id() != cpu) { + if (kthread_should_stop()) + return 1; + local_bh_enable(); + schedule_timeout_uninterruptible(1); + if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + local_bh_disable(); + } + return 0; +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * earlier RCU softirq. + */ +static int rcu_cpu_kthread(void *arg) +{ + int cpu = (int)(long)arg; + unsigned long flags; + int spincnt = 0; + wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu); + char work; + char *workp = &per_cpu(rcu_cpu_has_work, cpu); + + for (;;) { + wait_event_interruptible(*wqp, + *workp != 0 || kthread_should_stop()); + local_bh_disable(); + if (rcu_cpu_kthread_should_stop(cpu)) { + local_bh_enable(); + break; + } + local_irq_save(flags); + work = *workp; + *workp = 0; + local_irq_restore(flags); + if (work) + rcu_process_callbacks(); + local_bh_enable(); + if (*workp != 0) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + rcu_yield(cpu); + spincnt = 0; + } + } + return 0; +} + +/* + * Spawn a per-CPU kthread, setting up affinity and priority. + * Because the CPU hotplug lock is held, no other CPU will be attempting + * to manipulate rcu_cpu_kthread_task. There might be another CPU + * attempting to access it during boot, but the locking in kthread_bind() + * will enforce sufficient ordering. + */ +static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) +{ + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + per_cpu(rcu_cpu_kthread_task, cpu) != NULL) + return 0; + t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + if (IS_ERR(t)) + return PTR_ERR(t); + kthread_bind(t, cpu); + WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); + per_cpu(rcu_cpu_kthread_task, cpu) = t; + wake_up_process(t); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +/* + * Per-rcu_node kthread, which is in charge of waking up the per-CPU + * kthreads when needed. We ignore requests to wake up kthreads + * for offline CPUs, which is OK because force_quiescent_state() + * takes care of this case. + */ +static int rcu_node_kthread(void *arg) +{ + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp = (struct rcu_node *)arg; + struct sched_param sp; + struct task_struct *t; + + for (;;) { + wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0 || + kthread_should_stop()); + if (kthread_should_stop()) + break; + raw_spin_lock_irqsave(&rnp->lock, flags); + mask = rnp->wakemask; + rnp->wakemask = 0; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { + if ((mask & 0x1) == 0) + continue; + preempt_disable(); + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (!cpu_online(cpu) || t == NULL) { + preempt_enable(); + continue; + } + per_cpu(rcu_cpu_has_work, cpu) = 1; + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + preempt_enable(); + } + } + return 0; +} + +/* + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. + */ +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp) +{ + cpumask_var_t cm; + int cpu; + unsigned long mask = rnp->qsmaskinit; + + if (rnp->node_kthread_task == NULL || + rnp->qsmaskinit == 0) + return; + if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + return; + cpumask_clear(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) + if (mask & 0x1) + cpumask_set_cpu(cpu, cm); + set_cpus_allowed_ptr(rnp->node_kthread_task, cm); + free_cpumask_var(cm); +} + +/* + * Spawn a per-rcu_node kthread, setting priority and affinity. + */ +static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + int rnp_index = rnp - &rsp->node[0]; + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + rnp->qsmaskinit == 0 || + rnp->node_kthread_task != NULL) + return 0; + t = kthread_create(rcu_node_kthread, (void *)rnp, "rcun%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + rnp->node_kthread_task = t; + wake_up_process(t); + sp.sched_priority = 99; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +/* + * Spawn all kthreads -- called as soon as the scheduler is running. + */ +static int __init rcu_spawn_kthreads(void) +{ + int cpu; + struct rcu_node *rnp; + + rcu_kthreads_spawnable = 1; + for_each_possible_cpu(cpu) { + init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu)); + per_cpu(rcu_cpu_has_work, cpu) = 0; + if (cpu_online(cpu)) + (void)rcu_spawn_one_cpu_kthread(cpu); + } + rcu_for_each_leaf_node(&rcu_sched_state, rnp) { + init_waitqueue_head(&rnp->node_wq); + (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp); + } + return 0; +} +early_initcall(rcu_spawn_kthreads); + static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp) @@ -1771,6 +2085,19 @@ static void __cpuinit rcu_online_cpu(int cpu) rcu_preempt_init_percpu_data(cpu); } +static void __cpuinit rcu_online_kthreads(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ + if (rcu_kthreads_spawnable) { + (void)rcu_spawn_one_cpu_kthread(cpu); + if (rnp->node_kthread_task == NULL) + (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp); + } +} + /* * Handle CPU online/offline notification events. */ @@ -1778,11 +2105,17 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu); + struct rcu_node *rnp = rdp->mynode; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); + rcu_online_kthreads(cpu); + break; + case CPU_ONLINE: + rcu_node_kthread_setaffinity(rnp); break; case CPU_DYING: case CPU_DYING_FROZEN: @@ -1923,7 +2256,6 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5a439c180e69..c0213802d164 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -111,6 +111,7 @@ struct rcu_node { /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ + unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -134,6 +135,13 @@ struct rcu_node { /* if there is no such task. If there */ /* is no current expedited grace period, */ /* then there can cannot be any such task. */ + struct task_struct *node_kthread_task; + /* kthread that takes care of this rcu_node */ + /* structure, for example, awakening the */ + /* per-CPU kthreads as needed. */ + wait_queue_head_t node_wq; + /* Wait queue on which to park the per-node */ + /* kthread. */ } ____cacheline_internodealigned_in_smp; /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 774f010a4619..b9bd69a5a4fe 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1206,7 +1206,7 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a - * raise_softirq() to cause rcu_process_callbacks() to be invoked later. + * invoke_rcu_kthread() to cause rcu_process_callbacks() to be invoked later. * The per-cpu rcu_dyntick_drain variable controls the sequencing. */ int rcu_needs_cpu(int cpu) @@ -1257,7 +1257,7 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); return c; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 174f976c2874..13960170cad4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", - "TASKLET", "SCHED", "HRTIMER", "RCU" + "TASKLET", "SCHED", "HRTIMER" }; /* diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 0a7ed5b5e281..1e88485c16a0 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -2187,7 +2187,6 @@ static const struct flag flags[] = { { "TASKLET_SOFTIRQ", 6 }, { "SCHED_SOFTIRQ", 7 }, { "HRTIMER_SOFTIRQ", 8 }, - { "RCU_SOFTIRQ", 9 }, { "HRTIMER_NORESTART", 0 }, { "HRTIMER_RESTART", 1 },