From 86c2786994bd7c0d4b525bbfbe42ac540d0b8166 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 13 Aug 2015 12:40:57 +0300 Subject: [PATCH] perf intel-pt: Add support for PERF_RECORD_SWITCH Add support for selecting and processing PERF_RECORD_SWITCH events for use by Intel PT. If they are available, they will be used in preference to sched_switch events. This enables an unprivileged user to trace multi-threaded or multi-process workloads with any level of perf_event_paranoid. However it depends on kernel support for PERF_RECORD_SWITCH. Without this patch, tracing a multi-threaded workload will decode without error but all the data will be attributed to the main thread. Without this patch, tracing a multi-process workload will result in decoder errors because the decoder will not know which executable is executing. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1439458857-30636-3-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/intel-pt.c | 55 ++++++++++-- tools/perf/util/intel-pt.c | 129 ++++++++++++++++++++++------ 2 files changed, 151 insertions(+), 33 deletions(-) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 2ca10d796c0b..b02af064f0f9 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -624,13 +624,49 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * threads. */ if (have_timing_info && !cpu_map__empty(cpus)) { - err = intel_pt_track_switches(evlist); - if (err == -EPERM) - pr_debug2("Unable to select sched:sched_switch\n"); - else if (err) - return err; - else - ptr->have_sched_switch = 1; + if (perf_can_record_switch_events()) { + bool cpu_wide = !target__none(&opts->target) && + !target__has_task(&opts->target); + + if (!cpu_wide && perf_can_record_cpu_wide()) { + struct perf_evsel *switch_evsel; + + err = parse_events(evlist, "dummy:u", NULL); + if (err) + return err; + + switch_evsel = perf_evlist__last(evlist); + + switch_evsel->attr.freq = 0; + switch_evsel->attr.sample_period = 1; + switch_evsel->attr.context_switch = 1; + + switch_evsel->system_wide = true; + switch_evsel->no_aux_samples = true; + switch_evsel->immediate = true; + + perf_evsel__set_sample_bit(switch_evsel, TID); + perf_evsel__set_sample_bit(switch_evsel, TIME); + perf_evsel__set_sample_bit(switch_evsel, CPU); + + opts->record_switch_events = false; + ptr->have_sched_switch = 3; + } else { + opts->record_switch_events = true; + if (cpu_wide) + ptr->have_sched_switch = 3; + else + ptr->have_sched_switch = 2; + } + } else { + err = intel_pt_track_switches(evlist); + if (err == -EPERM) + pr_debug2("Unable to select sched:sched_switch\n"); + else if (err) + return err; + else + ptr->have_sched_switch = 1; + } } if (intel_pt_evsel) { @@ -663,8 +699,11 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, tracking_evsel->attr.sample_period = 1; /* In per-cpu case, always need the time of mmap events etc */ - if (!cpu_map__empty(cpus)) + if (!cpu_map__empty(cpus)) { perf_evsel__set_sample_bit(tracking_evsel, TIME); + /* And the CPU for switch events */ + perf_evsel__set_sample_bit(tracking_evsel, CPU); + } } /* diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index bb41c20e6005..2968b37ed9b0 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1145,11 +1145,13 @@ static int intel_pt_sample(struct intel_pt_queue *ptq) return 0; } -static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip) +static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip) { + struct machine *machine = pt->machine; struct map *map; struct symbol *sym, *start; u64 ip, switch_ip = 0; + const char *ptss; if (ptss_ip) *ptss_ip = 0; @@ -1177,8 +1179,13 @@ static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip) if (!switch_ip || !ptss_ip) return 0; + if (pt->have_sched_switch == 1) + ptss = "perf_trace_sched_switch"; + else + ptss = "__perf_event_task_sched_out"; + for (sym = start; sym; sym = dso__next_symbol(sym)) { - if (!strcmp(sym->name, "perf_trace_sched_switch")) { + if (!strcmp(sym->name, ptss)) { ip = map->unmap_ip(map, sym->start); if (ip >= map->start && ip < map->end) { *ptss_ip = ip; @@ -1198,11 +1205,11 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) if (!pt->kernel_start) { pt->kernel_start = machine__kernel_start(pt->machine); - if (pt->per_cpu_mmaps && pt->have_sched_switch && + if (pt->per_cpu_mmaps && + (pt->have_sched_switch == 1 || pt->have_sched_switch == 3) && !pt->timeless_decoding && intel_pt_tracing_kernel(pt) && !pt->sampling_mode) { - pt->switch_ip = intel_pt_switch_ip(pt->machine, - &pt->ptss_ip); + pt->switch_ip = intel_pt_switch_ip(pt, &pt->ptss_ip); if (pt->switch_ip) { intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n", pt->switch_ip, pt->ptss_ip); @@ -1387,31 +1394,18 @@ static struct intel_pt_queue *intel_pt_cpu_to_ptq(struct intel_pt *pt, int cpu) return NULL; } -static int intel_pt_process_switch(struct intel_pt *pt, - struct perf_sample *sample) +static int intel_pt_sync_switch(struct intel_pt *pt, int cpu, pid_t tid, + u64 timestamp) { struct intel_pt_queue *ptq; - struct perf_evsel *evsel; - pid_t tid; - int cpu, err; - - evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id); - if (evsel != pt->switch_evsel) - return 0; - - tid = perf_evsel__intval(evsel, sample, "next_pid"); - cpu = sample->cpu; - - intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n", - cpu, tid, sample->time, perf_time_to_tsc(sample->time, - &pt->tc)); + int err; if (!pt->sync_switch) - goto out; + return 1; ptq = intel_pt_cpu_to_ptq(pt, cpu); if (!ptq) - goto out; + return 1; switch (ptq->switch_state) { case INTEL_PT_SS_NOT_TRACING: @@ -1424,7 +1418,7 @@ static int intel_pt_process_switch(struct intel_pt *pt, return 0; case INTEL_PT_SS_EXPECTING_SWITCH_EVENT: if (!ptq->on_heap) { - ptq->timestamp = perf_time_to_tsc(sample->time, + ptq->timestamp = perf_time_to_tsc(timestamp, &pt->tc); err = auxtrace_heap__add(&pt->heap, ptq->queue_nr, ptq->timestamp); @@ -1441,10 +1435,76 @@ static int intel_pt_process_switch(struct intel_pt *pt, default: break; } -out: + + return 1; +} + +static int intel_pt_process_switch(struct intel_pt *pt, + struct perf_sample *sample) +{ + struct perf_evsel *evsel; + pid_t tid; + int cpu, ret; + + evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id); + if (evsel != pt->switch_evsel) + return 0; + + tid = perf_evsel__intval(evsel, sample, "next_pid"); + cpu = sample->cpu; + + intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n", + cpu, tid, sample->time, perf_time_to_tsc(sample->time, + &pt->tc)); + + ret = intel_pt_sync_switch(pt, cpu, tid, sample->time); + if (ret <= 0) + return ret; + return machine__set_current_tid(pt->machine, cpu, -1, tid); } +static int intel_pt_context_switch(struct intel_pt *pt, union perf_event *event, + struct perf_sample *sample) +{ + bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT; + pid_t pid, tid; + int cpu, ret; + + cpu = sample->cpu; + + if (pt->have_sched_switch == 3) { + if (!out) + return 0; + if (event->header.type != PERF_RECORD_SWITCH_CPU_WIDE) { + pr_err("Expecting CPU-wide context switch event\n"); + return -EINVAL; + } + pid = event->context_switch.next_prev_pid; + tid = event->context_switch.next_prev_tid; + } else { + if (out) + return 0; + pid = sample->pid; + tid = sample->tid; + } + + if (tid == -1) { + pr_err("context_switch event has no tid\n"); + return -EINVAL; + } + + intel_pt_log("context_switch: cpu %d pid %d tid %d time %"PRIu64" tsc %#"PRIx64"\n", + cpu, pid, tid, sample->time, perf_time_to_tsc(sample->time, + &pt->tc)); + + ret = intel_pt_sync_switch(pt, cpu, tid, sample->time); + if (ret <= 0) + return ret; + + return machine__set_current_tid(pt->machine, cpu, pid, tid); +} + static int intel_pt_process_itrace_start(struct intel_pt *pt, union perf_event *event, struct perf_sample *sample) @@ -1515,6 +1575,9 @@ static int intel_pt_process_event(struct perf_session *session, err = intel_pt_process_switch(pt, sample); else if (event->header.type == PERF_RECORD_ITRACE_START) err = intel_pt_process_itrace_start(pt, event, sample); + else if (event->header.type == PERF_RECORD_SWITCH || + event->header.type == PERF_RECORD_SWITCH_CPU_WIDE) + err = intel_pt_context_switch(pt, event, sample); intel_pt_log("event %s (%u): cpu %d time %"PRIu64" tsc %#"PRIx64"\n", perf_event__name(event->header.type), event->header.type, @@ -1777,6 +1840,18 @@ static struct perf_evsel *intel_pt_find_sched_switch(struct perf_evlist *evlist) return NULL; } +static bool intel_pt_find_switch(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) { + if (evsel->attr.context_switch) + return true; + } + + return false; +} + static const char * const intel_pt_info_fmts[] = { [INTEL_PT_PMU_TYPE] = " PMU Type %"PRId64"\n", [INTEL_PT_TIME_SHIFT] = " Time Shift %"PRIu64"\n", @@ -1888,6 +1963,10 @@ int intel_pt_process_auxtrace_info(union perf_event *event, pr_err("%s: missing sched_switch event\n", __func__); goto err_delete_thread; } + } else if (pt->have_sched_switch == 2 && + !intel_pt_find_switch(session->evlist)) { + pr_err("%s: missing context_switch attribute flag\n", __func__); + goto err_delete_thread; } if (session->itrace_synth_opts && session->itrace_synth_opts->set) {