From e48c178814b4a33f84f62d01f5a601ebd57fbba8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jul 2016 09:18:30 +0200 Subject: [PATCH] perf/core: Optimize perf_pmu_sched_task() For perf record -b, which requires the pmu::sched_task callback, the current code is rather expensive: 7.68% sched-pipe [kernel.vmlinux] [k] perf_pmu_sched_task 5.95% sched-pipe [kernel.vmlinux] [k] __switch_to 5.20% sched-pipe [kernel.vmlinux] [k] __intel_pmu_disable_all 3.95% sched-pipe perf [.] worker_thread The problem is that it will iterate all registered PMUs, most of which will not have anything to do. Avoid this by keeping an explicit list of PMUs that have requested the callback. The perf_sched_cb_{inc,dec}() functions already take the required pmu argument, and now that these functions are no longer called from NMI context we can use them to manage a list. With this patch applied the function doesn't show up in the top 4 anymore (it dropped to 18th place). 6.67% sched-pipe [kernel.vmlinux] [k] __switch_to 6.18% sched-pipe [kernel.vmlinux] [k] __intel_pmu_disable_all 3.92% sched-pipe [kernel.vmlinux] [k] switch_mm_irqs_off 3.71% sched-pipe perf [.] 
worker_thread Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 +++ kernel/events/core.c | 43 +++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2b6b43cc0dd5..529c41fa73c8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -774,6 +774,9 @@ struct perf_cpu_context { #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif + + struct list_head sched_cb_entry; + int sched_cb_usage; }; struct perf_output_handle { diff --git a/kernel/events/core.c b/kernel/events/core.c index 57aff715039f..803481cb6cbd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2805,13 +2805,26 @@ unlock: } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } + void perf_sched_cb_inc(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + this_cpu_inc(perf_sched_cb_usages); } @@ -2829,34 +2842,24 @@ static void perf_pmu_sched_task(struct task_struct *prev, { struct perf_cpu_context *cpuctx; struct pmu *pmu; - unsigned long flags; if (prev == next) return; - local_irq_save(flags); + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ - rcu_read_lock(); + if (WARN_ON_ONCE(!pmu->sched_task)) + continue; - list_for_each_entry_rcu(pmu, &pmus, 
entry) { - if (pmu->sched_task) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + pmu->sched_task(cpuctx->task_ctx, sched_in); - perf_pmu_disable(pmu); - - pmu->sched_task(cpuctx->task_ctx, sched_in); - - perf_pmu_enable(pmu); - - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } - - rcu_read_unlock(); - - local_irq_restore(flags); } static void perf_event_switch(struct task_struct *task, @@ -10393,6 +10396,8 @@ static void __init perf_event_init_all_cpus(void) INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); + + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } }