From 83a8df618eb04bd2819a758f3b409b1449862434 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 27 Nov 2008 01:46:33 +0100 Subject: [PATCH 01/15] tracing/function-graph-tracer: enhancements for the trace output Impact: enhance the output of the graph-tracer This patch applies some ideas of Ingo Molnar and Steven Rostedt. * Output leaf functions in one line with parenthesis, semicolon and duration output. * Add a second column (after cpu) for an overhead sign. if duration > 100 us, "!" if duration > 10 us, "+" else " " * Print output in us with remaining nanosec: u.n * Print duration on the right end, following the indentation of the functions. Use also visual clues: "-" on entry call (no duration to output) and "+" on return (duration output). The name of the tracer has been fixed as well: function-branch becomes function_branch. Here is an example of the new output: CPU[000] dequeue_entity() { - CPU[000] update_curr() { - CPU[000] update_min_vruntime(); + 0.512 us CPU[000] } + 1.504 us CPU[000] clear_buddies(); + 0.481 us CPU[000] update_min_vruntime(); + 0.504 us CPU[000] } + 4.557 us CPU[000] hrtick_update() { - CPU[000] hrtick_start_fair(); + 0.489 us CPU[000] } + 1.443 us CPU[000] + } + 14.655 us CPU[000] + } + 15.678 us CPU[000] + } + 16.686 us CPU[000] msecs_to_jiffies(); + 0.481 us CPU[000] put_prev_task_fair(); + 0.504 us CPU[000] pick_next_task_fair(); + 0.482 us CPU[000] pick_next_task_rt(); + 0.504 us CPU[000] pick_next_task_fair(); + 0.481 us CPU[000] pick_next_task_idle(); + 0.489 us CPU[000] _spin_trylock(); + 0.655 us CPU[000] _spin_unlock(); + 0.609 us CPU[000] ------------8<---------- thread bash-2794 ------------8<---------- CPU[000] finish_task_switch() { - CPU[000] _spin_unlock_irq(); + 0.722 us CPU[000] } + 2.369 us CPU[000] ! } + 501972.605 us CPU[000] ! } + 501973.763 us CPU[000] copy_from_read_buf() { - CPU[000] _spin_lock_irqsave(); + 0.670 us CPU[000] _spin_unlock_irqrestore(); + 0.699 us CPU[000] copy_to_user() { - CPU[000] might_fault() { - CPU[000] __might_sleep(); + 0.503 us CPU[000] } + 1.632 us CPU[000] __copy_to_user_ll(); + 0.542 us CPU[000] } + 3.858 us CPU[000] tty_audit_add_data() { - CPU[000] _spin_lock_irq(); + 0.609 us CPU[000] _spin_unlock_irq(); + 0.624 us CPU[000] } + 3.196 us CPU[000] _spin_lock_irqsave(); + 0.624 us CPU[000] _spin_unlock_irqrestore(); + 0.625 us CPU[000] + } + 13.611 us CPU[000] copy_from_read_buf() { - CPU[000] _spin_lock_irqsave(); + 0.624 us CPU[000] _spin_unlock_irqrestore(); + 0.616 us CPU[000] } + 2.820 us CPU[000] Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 176 +++++++++++++++++++++++++-- 1 file changed, 163 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d31d695174aa..b958d60377b3 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,6 +14,10 @@ #include "trace.h" #define TRACE_GRAPH_INDENT 2 +/* Spaces between function call and time duration */ +#define TRACE_GRAPH_TIMESPACE_ENTRY " " +/* Spaces between function call and closing braces */ +#define TRACE_GRAPH_TIMESPACE_RET " " #define TRACE_GRAPH_PRINT_OVERRUN 0x1 static struct tracer_opt trace_opts[] = { @@ -63,26 +67,90 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) last_pid[cpu] = pid; comm = trace_find_cmdline(pid); - return trace_seq_printf(s, "\nCPU[%03d]" + return trace_seq_printf(s, "\nCPU[%03d] " " ------------8<---------- thread %s-%d" " ------------8<----------\n\n", cpu, comm, pid); } -static enum print_line_t -print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, - struct trace_entry *ent, int cpu) +static bool +trace_branch_is_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *curr) { + struct ring_buffer_iter *ring_iter; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *next; + + ring_iter = iter->buffer_iter[iter->cpu]; + + if (!ring_iter) + return false; + + event = ring_buffer_iter_peek(iter->buffer_iter[iter->cpu], NULL); + + if (!event) + return false; + + next = ring_buffer_event_data(event); + + if (next->ent.type != TRACE_GRAPH_RET) + return false; + + if (curr->ent.pid != next->ent.pid || + curr->graph_ent.func != next->ret.func) + return false; + + return true; +} + + +static inline int +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + unsigned long nsecs_rem = do_div(duration, 1000); + return trace_seq_printf(s, "+ %llu.%lu us\n", duration, nsecs_rem); +} + +/* Signal a overhead of time execution to the output */ +static int +print_graph_overhead(unsigned long long duration, struct trace_seq *s) +{ + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + return trace_seq_printf(s, "! "); + + /* Duration exceeded 10 msecs */ + if (duration > 10000ULL) + return trace_seq_printf(s, "+ "); + + return trace_seq_printf(s, " "); +} + +/* Case of a leaf function on its call entry */ +static enum print_line_t +print_graph_entry_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, struct trace_seq *s) +{ + struct ftrace_graph_ret_entry *ret_entry; + struct ftrace_graph_ret *graph_ret; + struct ring_buffer_event *event; + struct ftrace_graph_ent *call; + unsigned long long duration; int i; int ret; - if (!verif_pid(s, ent->pid, cpu)) - return TRACE_TYPE_PARTIAL_LINE; + event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + ret_entry = ring_buffer_event_data(event); + graph_ret = &ret_entry->ret; + call = &entry->graph_ent; + duration = graph_ret->rettime - graph_ret->calltime; - ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + /* Overhead */ + ret = print_graph_overhead(duration, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); if (!ret) @@ -93,18 +161,64 @@ print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "() {\n"); + ret = trace_seq_printf(s, "();"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + + /* Duration */ + ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_duration(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } static enum print_line_t -print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent, int cpu) +print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, + struct trace_seq *s) { int i; int ret; + struct ftrace_graph_ent *call = &entry->graph_ent; + + /* No overhead */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = seq_print_ip_sym(s, call->func, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "() {"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* No duration to print at this state */ + ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY "-\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter, int cpu) +{ + int ret; + struct trace_entry *ent = iter->ent; if (!verif_pid(s, ent->pid, cpu)) return TRACE_TYPE_PARTIAL_LINE; @@ -113,6 +227,36 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; + if (trace_branch_is_leaf(iter, field)) + return print_graph_entry_leaf(iter, field, s); + else + return print_graph_entry_nested(field, s); + +} + +static enum print_line_t +print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, + struct trace_entry *ent, int cpu) +{ + int i; + int ret; + unsigned long long duration = trace->rettime - trace->calltime; + + /* Pid */ + if (!verif_pid(s, ent->pid, cpu)) + return TRACE_TYPE_PARTIAL_LINE; + + /* Cpu */ + ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Overhead */ + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); if (!ret) @@ -123,10 +267,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "%llu\n", trace->rettime - trace->calltime); + /* Duration */ + ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_RET); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + ret = print_graph_duration(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Overrun */ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { ret = trace_seq_printf(s, " (Overruns: %lu)\n", trace->overrun); @@ -146,7 +296,7 @@ print_graph_function(struct trace_iterator *iter) case TRACE_GRAPH_ENT: { struct ftrace_graph_ent_entry *field; trace_assign_type(field, entry); - return print_graph_entry(&field->graph_ent, s, entry, + return print_graph_entry(field, s, iter, iter->cpu); } case TRACE_GRAPH_RET: { @@ -160,7 +310,7 @@ print_graph_function(struct trace_iterator *iter) } static struct tracer graph_trace __read_mostly = { - .name = "function-graph", + .name = "function_graph", .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, From 1a056155edd458eb93ef383fa8e5741d7e7c6360 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Nov 2008 00:42:46 +0100 Subject: [PATCH 02/15] tracing/function-graph-tracer: adjustments of the trace informations Impact: increase the visual qualities of the call-graph-tracer output This patch applies various trace output formatting changes: - CPU is now a decimal number, followed by a parenthesis. - Overhead is now on the second column (gives a good visibility) - Cost is now on the third column, can't exceed 9999.99 us. It is followed by a virtual line based on a "|" character. - Functions calls are now the last column on the right. This way, we haven't dynamic column (which flow is harder to follow) on its right. - CPU and Overhead have their own option flag. They are default-on but you can disable them easily: echo nofuncgraph-cpu > trace_options echo nofuncgraph-overhead > trace_options TODO: _ Refactoring of the thread switch output. _ Give a default-off option to output the thread and its pid on each row. _ Provide headers _ .... Here is an example of the new trace style: 0) | mutex_unlock() { 0) 0.639 us | __mutex_unlock_slowpath(); 0) 1.607 us | } 0) | remove_wait_queue() { 0) 0.616 us | _spin_lock_irqsave(); 0) 0.616 us | _spin_unlock_irqrestore(); 0) 2.779 us | } 0) 0.495 us | n_tty_set_room(); 0) ! 9999.999 us | } 0) | tty_ldisc_deref() { 0) 0.615 us | _spin_lock_irqsave(); 0) 0.616 us | _spin_unlock_irqrestore(); 0) 2.793 us | } 0) | current_fs_time() { 0) 0.488 us | current_kernel_time(); 0) 0.495 us | timespec_trunc(); 0) 2.486 us | } 0) ! 9999.999 us | } 0) ! 9999.999 us | } 0) ! 9999.999 us | } 0) | sys_read() { 0) 0.796 us | fget_light(); 0) | vfs_read() { 0) | rw_verify_area() { 0) | security_file_permission() { 0) 0.488 us | cap_file_permission(); 0) 1.720 us | } 0) 3. 4 us | } 0) | tty_read() { 0) 0.488 us | tty_paranoia_check(); 0) | tty_ldisc_ref_wait() { 0) | tty_ldisc_try() { 0) 0.615 us | _spin_lock_irqsave(); 0) 0.615 us | _spin_unlock_irqrestore(); 0) 5.436 us | } 0) 6.427 us | } Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 142 ++++++++++++++++++--------- 1 file changed, 93 insertions(+), 49 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b958d60377b3..596a3ee43053 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,20 +14,25 @@ #include "trace.h" #define TRACE_GRAPH_INDENT 2 -/* Spaces between function call and time duration */ -#define TRACE_GRAPH_TIMESPACE_ENTRY " " -/* Spaces between function call and closing braces */ -#define TRACE_GRAPH_TIMESPACE_RET " " +/* Flag options */ #define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 + static struct tracer_opt trace_opts[] = { - /* Display overruns or not */ - { TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display overruns ? */ + { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display CPU ? */ + { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, + /* Display Overhead ? */ + { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - .val = 0, /* Don't display overruns by default */ + /* Don't display overruns by default */ + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, .opts = trace_opts }; @@ -56,6 +61,36 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } +static inline int log10_cpu(int nb) +{ + if (nb / 100) + return 3; + if (nb / 10) + return 2; + return 1; +} + +static enum print_line_t +print_graph_cpu(struct trace_seq *s, int cpu) +{ + int i; + int ret; + int log10_this = log10_cpu(cpu); + int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + + + for (i = 0; i < log10_all - log10_this; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + ret = trace_seq_printf(s, "%d) ", cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + + /* If the pid changed since the last trace, output this event */ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) { @@ -67,8 +102,7 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) last_pid[cpu] = pid; comm = trace_find_cmdline(pid); - return trace_seq_printf(s, "\nCPU[%03d] " - " ------------8<---------- thread %s-%d" + return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" " ------------8<----------\n\n", cpu, comm, pid); } @@ -86,7 +120,7 @@ trace_branch_is_leaf(struct trace_iterator *iter, if (!ring_iter) return false; - event = ring_buffer_iter_peek(iter->buffer_iter[iter->cpu], NULL); + event = ring_buffer_iter_peek(ring_iter, NULL); if (!event) return false; @@ -108,7 +142,7 @@ static inline int print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); - return trace_seq_printf(s, "+ %llu.%lu us\n", duration, nsecs_rem); + return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); } /* Signal a overhead of time execution to the output */ @@ -136,8 +170,8 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ring_buffer_event *event; struct ftrace_graph_ent *call; unsigned long long duration; - int i; int ret; + int i; event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); ret_entry = ring_buffer_event_data(event); @@ -145,8 +179,19 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; + /* Must not exceed 8 characters: 9999.999 us */ + if (duration > 10000000ULL) + duration = 9999999ULL; + /* Overhead */ - ret = print_graph_overhead(duration, s); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -161,16 +206,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "();"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Duration */ - ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(duration, s); + ret = trace_seq_printf(s, "();\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -186,9 +222,14 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ent *call = &entry->graph_ent; /* No overhead */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No time */ + ret = trace_seq_printf(s, " | "); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -201,12 +242,7 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "() {"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* No duration to print at this state */ - ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY "-\n"); + ret = trace_seq_printf(s, "() {\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -220,12 +256,16 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, int ret; struct trace_entry *ent = iter->ent; + /* Pid */ if (!verif_pid(s, ent->pid, cpu)) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "CPU[%03d] ", cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } if (trace_branch_is_leaf(iter, field)) return print_graph_entry_leaf(iter, field, s); @@ -242,17 +282,30 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; unsigned long long duration = trace->rettime - trace->calltime; + /* Must not exceed 8 characters: xxxx.yyy us */ + if (duration > 10000000ULL) + duration = 9999999ULL; + /* Pid */ if (!verif_pid(s, ent->pid, cpu)) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ - ret = trace_seq_printf(s, "CPU[%03d] ", cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Overhead */ - ret = print_graph_overhead(duration, s); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -263,16 +316,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "} "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Duration */ - ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_RET); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(duration, s); + ret = trace_seq_printf(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; From d51090b34602a20984ab0312ef04e47069c0aec6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Nov 2008 09:55:16 +0100 Subject: [PATCH 03/15] tracing/function-graph-tracer: more output tweaks Impact: prettify the output some more Before: 0) | sys_read() { 0) 0.796 us | fget_light(); 0) | vfs_read() { 0) | rw_verify_area() { 0) | security_file_permission() { ------------8<---------- thread sshd-1755 ------------8<---------- After: 0) | sys_read() { 0) 0.796 us | fget_light(); 0) | vfs_read() { 0) | rw_verify_area() { 0) | security_file_permission() { ------------------------------------------ | 1) migration/0--1 => sshd-1755 ------------------------------------------ Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 47 +++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 596a3ee43053..894b50bca313 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -79,6 +79,19 @@ print_graph_cpu(struct trace_seq *s, int cpu) int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + /* + * Start with a space character - to make it stand out + * to the right a bit when trace output is pasted into + * email: + */ + ret = trace_seq_printf(s, " "); + + /* + * Tricky - we space the CPU field according to the max + * number of online CPUs. On a 2-cpu system it would take + * a maximum of 1 digit - on a 128 cpu system it would + * take up to 3 digits: + */ for (i = 0; i < log10_all - log10_this; i++) { ret = trace_seq_printf(s, " "); if (!ret) @@ -86,7 +99,8 @@ print_graph_cpu(struct trace_seq *s, int cpu) } ret = trace_seq_printf(s, "%d) ", cpu); if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } @@ -94,17 +108,34 @@ print_graph_cpu(struct trace_seq *s, int cpu) /* If the pid changed since the last trace, output this event */ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) { - char *comm; + char *comm, *prev_comm; + pid_t prev_pid; + int ret; if (last_pid[cpu] != -1 && last_pid[cpu] == pid) return 1; + prev_pid = last_pid[cpu]; last_pid[cpu] = pid; - comm = trace_find_cmdline(pid); - return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" - " ------------8<----------\n\n", - cpu, comm, pid); + comm = trace_find_cmdline(pid); + prev_comm = trace_find_cmdline(prev_pid); + +/* + * Context-switch trace line: + + ------------------------------------------ + | 1) migration/0--1 => sshd-1755 + ------------------------------------------ + + */ + ret = trace_seq_printf(s, + " ------------------------------------------\n"); + ret += trace_seq_printf(s, " | %d) %s-%d => %s-%d\n", + cpu, prev_comm, prev_pid, comm, pid); + ret += trace_seq_printf(s, + " ------------------------------------------\n\n"); + return ret; } static bool @@ -142,7 +173,7 @@ static inline int print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); - return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); + return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); } /* Signal a overhead of time execution to the output */ @@ -229,7 +260,7 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, } /* No time */ - ret = trace_seq_printf(s, " | "); + ret = trace_seq_printf(s, " | "); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { From c7425acb42fff1e723b05fbf4ea11e9a455d95dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Fri, 28 Nov 2008 11:17:56 +0200 Subject: [PATCH 04/15] tracing, alpha: fix build: add missing #ifdef CONFIG_STACKTRACE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are architectures that still have no stacktrace support. Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5811e0a5f732..91887a280ab9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -983,6 +983,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, int pc) { +#ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -1008,6 +1009,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, save_stack_trace_user(&trace); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +#endif } void __trace_userstack(struct trace_array *tr, From 50cdaf08a8ec1d7f43987705da7aff7cf949708f Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Fri, 28 Nov 2008 12:13:21 +0800 Subject: [PATCH 05/15] ftrace: improve seq_operation of ftrace Impact: make ftrace position computing more sane First remove useless ->pos field. Then we needn't check seq_printf in .show like other place. Signed-off-by: Liming Wang Reviewed-by: Bruce Ashfield Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cbf8b09f63a5..08b536a2614e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -786,7 +786,6 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - loff_t pos; struct ftrace_page *pg; unsigned idx; unsigned flags; @@ -811,6 +810,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; + } else { + iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -833,8 +834,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } spin_unlock(&ftrace_lock); - iter->pos = *pos; - return rec; } @@ -842,13 +841,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; - loff_t l = -1; - if (*pos > iter->pos) - *pos = iter->pos; + if (*pos > 0) { + if (iter->idx < 0) + return p; + (*pos)--; + iter->idx--; + } - l = *pos; - p = t_next(m, p, &l); + p = t_next(m, p, pos); return p; } @@ -859,21 +860,15 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { - struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; - int ret = 0; if (!rec) return 0; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - ret = seq_printf(m, "%s\n", str); - if (ret < 0) { - iter->pos--; - iter->idx--; - } + seq_printf(m, "%s\n", str); return 0; } @@ -899,7 +894,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; - iter->pos = 0; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -986,7 +980,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->pos = 0; iter->flags = enable ? FTRACE_ITER_FILTER : FTRACE_ITER_NOTRACE; From c072c24975ec4f0ccfcb6f5c8a8040b6eb75ef8f Mon Sep 17 00:00:00 2001 From: walimis Date: Fri, 28 Nov 2008 12:21:19 +0800 Subject: [PATCH 06/15] ftrace: improve documentation Impact: extend documentation with notice of using wild cards correctly We know that we can use wild cards to set set_ftrace_filter, but there's problem when using them naively such as: echo h* > /debug/tracing/set_ftrace_filter If there are files named with "h" prefix in current directory, echo "h*" will echo these filenames to set_ftrace_filter, not the intended "h*". For example: $ cat /debug/tracing/available_filter_functions |grep ^hr |wc -l 23 $ ls $ touch hraa hrdd $ ls hraa hrdd $ echo hr* > /debug/tracing/set_ftrace_filter $ cat /debug/tracing/set_ftrace_filter No output in /debug/tracing/set_ftrace_filter! If we use '' to escape wild cards, it works: $ ls hraa hrdd $ echo "hr*" > /debug/tracing/set_ftrace_filter $ cat /debug/tracing/set_ftrace_filter |wc -l 23 This problem can lead to unexpected result if current directory has a lot of files. Signed-off-by: walimis Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index de05042f11b9..803b1318b13d 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -1251,7 +1251,11 @@ These are the only wild cards which are supported. * will not work. - # echo hrtimer_* > /debug/tracing/set_ftrace_filter +Note: It is better to use quotes to enclose the wild cards, otherwise + the shell may expand the parameters into names of files in the local + directory. + + # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter Produces: @@ -1306,7 +1310,7 @@ Again, now we want to append. # echo sys_nanosleep > /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter sys_nanosleep - # echo hrtimer_* >> /debug/tracing/set_ftrace_filter + # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter hrtimer_run_queues hrtimer_run_pending From c7b0d17366d6e04a11470fc8d85f9fbac02671b9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Nov 2008 13:18:55 -0800 Subject: [PATCH 07/15] powerpc: ftrace, do nothing in mcount call for dyn ftrace Impact: quicken mcount calls that are not replaced by dyn ftrace Dynamic ftrace no longer does on the fly recording of mcount locations. The mcount locations are now found at compile time. The mcount function no longer needs to store registers and call a stub function. It can now just simply return. Since there are some functions that do not get converted to a nop (.init sections and other code that may disappear), this patch should help speed up that code. Also, the stub for mcount on PowerPC 32 can not be a simple branch link register like it is on PowerPC 64. According to the ABI specification: "The _mcount routine is required to restore the link register from the stack so that the profiling code can be inserted transparently, whether or not the profiled function saves the link register itself." This means that we must restore the link register that was used to make the call to mcount. The minimal mcount function for PPC32 ends up being: mcount: mflr r0 mtctr r0 lwz r0, 4(r1) mtlr r0 bctr Where we move the link register used to call mcount into the ctr register, and then restore the link register from the stack. Then we use the ctr register to jump back to the mcount caller. The r0 register is free for us to use. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/entry_32.S | 40 ++++++++-------------------------- arch/powerpc/kernel/entry_64.S | 12 ---------- 2 files changed, 9 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 7ecc0d1855c3..6f7eb7e00c79 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1162,39 +1162,17 @@ machine_check_in_rtas: #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) - stwu r1,-48(r1) - stw r3, 12(r1) - stw r4, 16(r1) - stw r5, 20(r1) - stw r6, 24(r1) - mflr r3 - stw r7, 28(r1) - mfcr r5 - stw r8, 32(r1) - stw r9, 36(r1) - stw r10,40(r1) - stw r3, 44(r1) - stw r5, 8(r1) - subi r3, r3, MCOUNT_INSN_SIZE - .globl mcount_call -mcount_call: - bl ftrace_stub - nop - lwz r6, 8(r1) - lwz r0, 44(r1) - lwz r3, 12(r1) + /* + * It is required that _mcount on PPC32 must preserve the + * link register. But we have r0 to play with. We use r0 + * to push the return address back to the caller of mcount + * into the ctr register, restore the link register and + * then jump back using the ctr register. + */ + mflr r0 mtctr r0 - lwz r4, 16(r1) - mtcr r6 - lwz r5, 20(r1) - lwz r6, 24(r1) - lwz r0, 52(r1) - lwz r7, 28(r1) - lwz r8, 32(r1) + lwz r0, 4(r1) mtlr r0 - lwz r9, 36(r1) - lwz r10,40(r1) - addi r1, r1, 48 bctr _GLOBAL(ftrace_caller) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e6d52845854f..b00982e0d1ee 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -888,18 +888,6 @@ _GLOBAL(enter_prom) #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) - /* Taken from output of objdump from lib64/glibc */ - mflr r3 - stdu r1, -112(r1) - std r3, 128(r1) - subi r3, r3, MCOUNT_INSN_SIZE - .globl mcount_call -mcount_call: - bl ftrace_stub - nop - ld r0, 128(r1) - mtlr r0 - addi r1, r1, 112 blr _GLOBAL(ftrace_caller) From d9af12b72bfe2a4efc1d347e0ac1c669b85dcea9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2008 06:39:18 -0800 Subject: [PATCH 08/15] powerpc: ftrace, fix cast aliasing and add code verification Impact: clean up and robustness addition This patch addresses the comments made by Paul Mackerras. It removes the type casting between unsigned int and unsigned char pointers, and replaces them with a use of all unsigned int. Verification that the jump is indeed made to a trampoline has also been added. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/ftrace.c | 121 +++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 56 deletions(-) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 3271cd698e4c..ea454a004406 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -162,26 +162,25 @@ static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned char replaced[MCOUNT_INSN_SIZE * 2]; - unsigned int *op = (unsigned *)&replaced; - unsigned char jmp[8]; - unsigned long *ptr = (unsigned long *)&jmp; + unsigned int op; + unsigned int jmp[5]; + unsigned long ptr; unsigned long ip = rec->ip; unsigned long tramp; int offset; /* read where this goes */ - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (probe_kernel_read(&op, (void *)ip, sizeof(int))) return -EFAULT; /* Make sure that that this is still a 24bit jump */ - if (!is_bl_op(*op)) { - printk(KERN_ERR "Not expected bl: opcode is %x\n", *op); + if (!is_bl_op(op)) { + printk(KERN_ERR "Not expected bl: opcode is %x\n", op); return -EINVAL; } /* lets find where the pointer goes */ - tramp = find_bl_target(ip, *op); + tramp = find_bl_target(ip, op); /* * On PPC64 the trampoline looks like: @@ -200,19 +199,25 @@ __ftrace_make_nop(struct module *mod, DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc); /* Find where the trampoline jumps to */ - if (probe_kernel_read(jmp, (void *)tramp, 8)) { + if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { printk(KERN_ERR "Failed to read %lx\n", tramp); return -EFAULT; } - DEBUGP(" %08x %08x", - (unsigned)(*ptr >> 32), - (unsigned)*ptr); + DEBUGP(" %08x %08x", jmp[0], jmp[1]); - offset = (unsigned)jmp[2] << 24 | - (unsigned)jmp[3] << 16 | - (unsigned)jmp[6] << 8 | - (unsigned)jmp[7]; + /* verify that this is what we expect it to be */ + if (((jmp[0] & 0xffff0000) != 0x3d820000) || + ((jmp[1] & 0xffff0000) != 0x398c0000) || + (jmp[2] != 0xf8410028) || + (jmp[3] != 0xe96c0020) || + (jmp[4] != 0xe84c0028)) { + printk(KERN_ERR "Not a trampoline\n"); + return -EINVAL; + } + + offset = (unsigned)((unsigned short)jmp[0]) << 16 | + (unsigned)((unsigned short)jmp[1]); DEBUGP(" %x ", offset); @@ -225,13 +230,13 @@ __ftrace_make_nop(struct module *mod, return -EFAULT; } - DEBUGP(" %08x %08x\n", - (unsigned)(*ptr >> 32), - (unsigned)*ptr); + DEBUGP(" %08x %08x\n", jmp[0], jmp[1]); + + ptr = ((unsigned long)jmp[0] << 32) + jmp[1]; /* This should match what was called */ - if (*ptr != GET_ADDR(addr)) { - printk(KERN_ERR "addr does not match %lx\n", *ptr); + if (ptr != GET_ADDR(addr)) { + printk(KERN_ERR "addr does not match %lx\n", ptr); return -EINVAL; } @@ -240,11 +245,11 @@ __ftrace_make_nop(struct module *mod, * 0xe8, 0x41, 0x00, 0x28 ld r2,40(r1) * This needs to be turned to a nop too. */ - if (probe_kernel_read(replaced, (void *)(ip+4), MCOUNT_INSN_SIZE)) + if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) return -EFAULT; - if (*op != 0xe8410028) { - printk(KERN_ERR "Next line is not ld! (%08x)\n", *op); + if (op != 0xe8410028) { + printk(KERN_ERR "Next line is not ld! (%08x)\n", op); return -EINVAL; } @@ -261,9 +266,9 @@ __ftrace_make_nop(struct module *mod, * ld r2,40(r1) * 1: */ - op[0] = 0x48000008; /* b +8 */ + op = 0x48000008; /* b +8 */ - if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) + if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; return 0; @@ -274,46 +279,52 @@ static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned char replaced[MCOUNT_INSN_SIZE]; - unsigned int *op = (unsigned *)&replaced; - unsigned char jmp[8]; - unsigned int *ptr = (unsigned int *)&jmp; + unsigned int op; + unsigned int jmp[4]; unsigned long ip = rec->ip; unsigned long tramp; - int offset; - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) return -EFAULT; /* Make sure that that this is still a 24bit jump */ - if (!is_bl_op(*op)) { - printk(KERN_ERR "Not expected bl: opcode is %x\n", *op); + if (!is_bl_op(op)) { + printk(KERN_ERR "Not expected bl: opcode is %x\n", op); return -EINVAL; } /* lets find where the pointer goes */ - tramp = find_bl_target(ip, *op); + tramp = find_bl_target(ip, op); /* * On PPC32 the trampoline looks like: - * lis r11,sym@ha - * addi r11,r11,sym@l - * mtctr r11 - * bctr + * 0x3d, 0x60, 0x00, 0x00 lis r11,sym@ha + * 0x39, 0x6b, 0x00, 0x00 addi r11,r11,sym@l + * 0x7d, 0x69, 0x03, 0xa6 mtctr r11 + * 0x4e, 0x80, 0x04, 0x20 bctr */ DEBUGP("ip:%lx jumps to %lx", ip, tramp); /* Find where the trampoline jumps to */ - if (probe_kernel_read(jmp, (void *)tramp, 8)) { + if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { printk(KERN_ERR "Failed to read %lx\n", tramp); return -EFAULT; } - DEBUGP(" %08x %08x ", ptr[0], ptr[1]); + DEBUGP(" %08x %08x ", jmp[0], jmp[1]); - tramp = (ptr[1] & 0xffff) | - ((ptr[0] & 0xffff) << 16); + /* verify that this is what we expect it to be */ + if (((jmp[0] & 0xffff0000) != 0x3d600000) || + ((jmp[1] & 0xffff0000) != 0x396b0000) || + (jmp[2] != 0x7d6903a6) || + (jmp[3] != 0x4e800420)) { + printk(KERN_ERR "Not a trampoline\n"); + return -EINVAL; + } + + tramp = (jmp[1] & 0xffff) | + ((jmp[0] & 0xffff) << 16); if (tramp & 0x8000) tramp -= 0x10000; @@ -326,9 +337,9 @@ __ftrace_make_nop(struct module *mod, return -EINVAL; } - op[0] = PPC_NOP_INSTR; + op = PPC_NOP_INSTR; - if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) + if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; return 0; @@ -384,13 +395,12 @@ int ftrace_make_nop(struct module *mod, static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned char replaced[MCOUNT_INSN_SIZE * 2]; - unsigned int *op = (unsigned *)&replaced; + unsigned int op[2]; unsigned long ip = rec->ip; unsigned long offset; /* read where this goes */ - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE * 2)) + if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2)) return -EFAULT; /* @@ -425,7 +435,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) DEBUGP("write to %lx\n", rec->ip); - if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE * 2)) + if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2)) return -EPERM; return 0; @@ -434,18 +444,17 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned char replaced[MCOUNT_INSN_SIZE]; - unsigned int *op = (unsigned *)&replaced; + unsigned int op; unsigned long ip = rec->ip; unsigned long offset; /* read where this goes */ - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) return -EFAULT; /* It should be pointing to a nop */ - if (op[0] != PPC_NOP_INSTR) { - printk(KERN_ERR "Expected NOP but have %x\n", op[0]); + if (op != PPC_NOP_INSTR) { + printk(KERN_ERR "Expected NOP but have %x\n", op); return -EINVAL; } @@ -465,11 +474,11 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) } /* Set to "bl addr" */ - op[0] = branch_offset(offset); + op = branch_offset(offset); DEBUGP("write to %lx\n", rec->ip); - if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) + if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; return 0; From ec682cef2d2c1a25a198d32a87fe2649da671d1e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2008 10:22:48 -0800 Subject: [PATCH 09/15] powerpc: ftrace, added missing icache flush Impact: fix to PowerPC code modification After modifying code it is essential to flush the icache. This patch adds the missing flush. Reported-by: Paul Mackerras Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/ftrace.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index ea454a004406..a4640e4f1172 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -271,6 +271,9 @@ __ftrace_make_nop(struct module *mod, if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; + + flush_icache_range(ip, ip + 8); + return 0; } @@ -342,6 +345,8 @@ __ftrace_make_nop(struct module *mod, if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; + flush_icache_range(ip, ip + 8); + return 0; } #endif /* PPC64 */ @@ -438,6 +443,8 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2)) return -EPERM; + flush_icache_range(ip, ip + 8); + return 0; } #else @@ -481,6 +488,8 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; + flush_icache_range(ip, ip + 8); + return 0; } #endif /* CONFIG_PPC64 */ From 0029ff87529dff01a4b9c5bf380a0caacb5f7418 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2008 14:06:19 -0800 Subject: [PATCH 10/15] powerpc: ftrace, use create_branch Impact: clean up Paul Mackerras pointed out that the code to determine if the branch can reach the destination is incorrect. Michael Ellerman suggested to pull out the code from create_branch and use that. Simply using create_branch is probably the best. Reported-by: Michael Ellerman Reported-by: Paul Mackerras Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/ftrace.c | 54 ++++++++---------------------------- 1 file changed, 12 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index a4640e4f1172..5355244c99ff 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -114,19 +114,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, */ static int test_24bit_addr(unsigned long ip, unsigned long addr) { - long diff; - /* - * Can we get to addr from ip in 24 bits? - * (26 really, since we mulitply by 4 for 4 byte alignment) - */ - diff = addr - ip; - - /* - * Return true if diff is less than 1 << 25 - * and greater than -1 << 26. - */ - return (diff < (1 << 25)) && (diff > (-1 << 26)); + /* use the create_branch to verify that this offset can be branched */ + return create_branch((unsigned int *)ip, addr, 0); } static int is_bl_op(unsigned int op) @@ -134,11 +124,6 @@ static int is_bl_op(unsigned int op) return (op & 0xfc000003) == 0x48000001; } -static int test_offset(unsigned long offset) -{ - return (offset + 0x2000000 > 0x3ffffff) || ((offset & 3) != 0); -} - static unsigned long find_bl_target(unsigned long ip, unsigned int op) { static int offset; @@ -151,12 +136,6 @@ static unsigned long find_bl_target(unsigned long ip, unsigned int op) return ip + (long)offset; } -static unsigned int branch_offset(unsigned long offset) -{ - /* return "bl ip+offset" */ - return 0x48000001 | (offset & 0x03fffffc); -} - #ifdef CONFIG_PPC64 static int __ftrace_make_nop(struct module *mod, @@ -402,7 +381,6 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned int op[2]; unsigned long ip = rec->ip; - unsigned long offset; /* read where this goes */ if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2)) @@ -424,17 +402,14 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return -EINVAL; } - /* now calculate a jump to the ftrace caller trampoline */ - offset = rec->arch.mod->arch.tramp - ip; - - if (test_offset(offset)) { - printk(KERN_ERR "REL24 %li out of range!\n", - (long int)offset); + /* create the branch to the trampoline */ + op[0] = create_branch((unsigned int *)ip, + rec->arch.mod->arch.tramp, BRANCH_SET_LINK); + if (!op[0]) { + printk(KERN_ERR "REL24 out of range!\n"); return -EINVAL; } - /* Set to "bl addr" */ - op[0] = branch_offset(offset); /* ld r2,40(r1) */ op[1] = 0xe8410028; @@ -453,7 +428,6 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned int op; unsigned long ip = rec->ip; - unsigned long offset; /* read where this goes */ if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) @@ -471,18 +445,14 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return -EINVAL; } - /* now calculate a jump to the ftrace caller trampoline */ - offset = rec->arch.mod->arch.tramp - ip; - - if (test_offset(offset)) { - printk(KERN_ERR "REL24 %li out of range!\n", - (long int)offset); + /* create the branch to the trampoline */ + op = create_branch((unsigned int *)ip, + rec->arch.mod->arch.tramp, BRANCH_SET_LINK); + if (!op) { + printk(KERN_ERR "REL24 out of range!\n"); return -EINVAL; } - /* Set to "bl addr" */ - op = branch_offset(offset); - DEBUGP("write to %lx\n", rec->ip); if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) From f1eecf0e4f0796911cc076f38fcf05fea0b353d5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 15:54:46 -0500 Subject: [PATCH 11/15] powerpc/ppc32: static ftrace fixes for PPC32 Impact: fix for PowerPC 32 code There were some early init code that was not safe for static ftrace to boot on my PowerBook. This code must only use relative addressing, and static mcount performs a compare of the ftrace_trace_function pointer, and gets that with an absolute address. In the early init boot up code, this will cause a fault. This patch removes tracing from the files containing the offending functions. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/Makefile | 1 + arch/powerpc/lib/Makefile | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 92673b43858d..d17edb4a2f9d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -17,6 +17,7 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog +CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog ifdef CONFIG_DYNAMIC_FTRACE # dynamic ftrace setup. diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index d69912c07ce7..8db35278a4b4 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -6,6 +6,9 @@ ifeq ($(CONFIG_PPC64),y) EXTRA_CFLAGS += -mno-minimal-toc endif +CFLAGS_REMOVE_code-patching.o = -pg +CFLAGS_REMOVE_feature-fixups.o = -pg + obj-y := string.o alloc.o \ checksum_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o From a838c2ec6ea1f18431da74dfe4978c57355b95f3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 27 Nov 2008 16:14:44 +0800 Subject: [PATCH 12/15] markers: comment marker_synchronize_unregister() on data dependency Add document and comments on marker_synchronize_unregister(): it should be called before freeing resources that the probes depend on. Based on comments from Lai Jiangshan and Mathieu Desnoyers. Signed-off-by: Wu Fengguang Reviewed-by: Mathieu Desnoyers Reviewed-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- Documentation/markers.txt | 15 ++++++++++----- include/linux/marker.h | 6 ++++-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Documentation/markers.txt b/Documentation/markers.txt index 6d275e4ef385..d2b3d0e91b26 100644 --- a/Documentation/markers.txt +++ b/Documentation/markers.txt @@ -51,11 +51,16 @@ to call) for the specific marker through marker_probe_register() and can be activated by calling marker_arm(). Marker deactivation can be done by calling marker_disarm() as many times as marker_arm() has been called. Removing a probe is done through marker_probe_unregister(); it will disarm the probe. -marker_synchronize_unregister() must be called before the end of the module exit -function to make sure there is no caller left using the probe. This, and the -fact that preemption is disabled around the probe call, make sure that probe -removal and module unload are safe. See the "Probe example" section below for a -sample probe module. + +marker_synchronize_unregister() must be called between probe unregistration and +the first occurrence of +- the end of module exit function, + to make sure there is no caller left using the probe; +- the free of any resource used by the probes, + to make sure the probes wont be accessing invalid data. +This, and the fact that preemption is disabled around the probe call, make sure +that probe removal and module unload are safe. See the "Probe example" section +below for a sample probe module. The marker mechanism supports inserting multiple instances of the same marker. Markers can be put in inline functions, inlined static functions, and diff --git a/include/linux/marker.h b/include/linux/marker.h index 34c14bc957f5..b85e74ca782f 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -211,8 +211,10 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe, /* * marker_synchronize_unregister must be called between the last marker probe - * unregistration and the end of module exit to make sure there is no caller - * executing a probe when it is freed. + * unregistration and the first one of + * - the end of module exit function + * - the free of any resource used by the probes + * to ensure the code and data are valid for any possibly running probes. */ #define marker_synchronize_unregister() synchronize_sched() From 604094f4615180f71da799e7e5b191f5c2a42a28 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Nov 2008 18:03:22 +0100 Subject: [PATCH 13/15] vfs, seqfile: export mangle_path() generally mangle_path() is trivial enough to make export restrictions on it pointless - so change the export from EXPORT_SYMBOL_GPL to EXPORT_SYMBOL. Signed-off-by: Ingo Molnar Acked-by: Al Viro --- fs/seq_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index f03220d7891b..16c211558c22 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -387,7 +387,7 @@ char *mangle_path(char *s, char *p, char *esc) } return NULL; } -EXPORT_SYMBOL_GPL(mangle_path); +EXPORT_SYMBOL(mangle_path); /* * return the absolute path of 'dentry' residing in mount 'mnt'. From 65c6dc6adbe7ee0acf207445243400a68c77af15 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 29 Nov 2008 04:12:46 +0100 Subject: [PATCH 14/15] tracing/branch-tracer: include missing irqflags.h Impact: fix build error on branch tracer This should fix a build error reported on alpha in linux-next: CC kernel/trace/trace_branch.o kernel/trace/trace_branch.c: In function 'probe_likely_condition': kernel/trace/trace_branch.c:44: error: implicit declaration of function 'raw_local_irq_save' kernel/trace/trace_branch.c:76: error: implicit declaration of function 'raw_local_irq_restore' Unfortunately, I can't test it since I don't have any Alpha build environment. Reported-by: Alexey Dobriyan Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 877ee88e6a74..bc972753568d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include From f08340c5d68ab621f377c108637e2d8e95b3e5d4 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Sat, 29 Nov 2008 15:43:32 +0530 Subject: [PATCH 15/15] tracepoints: Documentation TPPROTO misspelt in Documentation/tracepoints.txt Impact: fix typo in documentation TPPROTO is misspelt in Documentation/tracepoints.txt Kept me wondering what was wrong, when I was trying to add a new tracepoint subsystem. Signed-off-by: Nikanth Karthikesan Signed-off-by: Ingo Molnar --- Documentation/tracepoints.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 2d42241a25c3..6f0a044f5b5e 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -45,7 +45,7 @@ In include/trace/subsys.h : #include DECLARE_TRACE(subsys_eventname, - TPPTOTO(int firstarg, struct task_struct *p), + TPPROTO(int firstarg, struct task_struct *p), TPARGS(firstarg, p)); In subsys/file.c (where the tracing statement must be added) : @@ -66,7 +66,7 @@ Where : - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the +- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the function called by this tracepoint. - TPARGS(firstarg, p) are the parameters names, same as found in the