From e725c731e3bb1e892e7b564c945b121cb41d1087 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 3 Mar 2017 13:37:33 -0500
Subject: [PATCH 01/60] tracing: Split tracing initialization into two for
 early initialization

Create an early_trace_init() function that will initialize the buffers
and allow for earlier use of trace_printk(). This will also allow for
future work to have function tracing start earlier at boot up.

Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/ftrace.h | 2 ++
 init/main.c            | 5 ++++-
 kernel/trace/trace.c   | 6 +++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 3633e8beff39..569db5589851 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -42,8 +42,10 @@
 /* Main tracing buffer and events set up */
 #ifdef CONFIG_TRACING
 void trace_init(void);
+void early_trace_init(void);
 #else
 static inline void trace_init(void) { }
+static inline void early_trace_init(void) { }
 #endif
 
 struct module;
diff --git a/init/main.c b/init/main.c
index f9c9d9948203..81a49e8d54cc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -545,6 +545,9 @@ asmlinkage __visible void __init start_kernel(void)
 	trap_init();
 	mm_init();
 
+	/* trace_printk can be enabled here */
+	early_trace_init();
+
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -570,7 +573,7 @@ asmlinkage __visible void __init start_kernel(void)
 
 	rcu_init();
 
-	/* trace_printk() and trace points may be used after this */
+	/* Trace events are available after this */
 	trace_init();
 
 	context_tracking_init();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f35109514a01..6757561d9617 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7999,7 +7999,7 @@ out:
 	return ret;
 }
 
-void __init trace_init(void)
+void __init early_trace_init(void)
 {
 	if (tracepoint_printk) {
 		tracepoint_print_iter =
@@ -8010,6 +8010,10 @@
 		static_key_enable(&tracepoint_printk_key.key);
 	}
 	tracer_alloc_buffers();
+}
+
+void __init trace_init(void)
+{
 	trace_event_init();
 }

From f631718de3ca24a9ae03595e937fe0b64cfaf456 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 3 Mar 2017 13:43:34 -0500
Subject: [PATCH 02/60] ftrace: Move ftrace_init() to right after memory
 initialization

Initialize the ftrace records immediately after memory initialization,
as that is all that is required for the records to be created. This
will allow for future work to get function tracing started earlier in
the boot process.
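For reference, a condensed sketch (not a literal hunk) of how the top of
start_kernel() reads once this patch and the previous one are applied;
only the tracing-related calls are shown:

	asmlinkage __visible void __init start_kernel(void)
	{
		...
		trap_init();
		mm_init();

		ftrace_init();		/* ftrace records created from the mcount locations */

		/* trace_printk can be enabled here */
		early_trace_init();	/* trace buffers allocated, trace_printk() usable */
		...
	}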
Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 81a49e8d54cc..c0137b916aa1 100644 --- a/init/main.c +++ b/init/main.c @@ -545,6 +545,8 @@ asmlinkage __visible void __init start_kernel(void) trap_init(); mm_init(); + ftrace_init(); + /* trace_printk can be enabled here */ early_trace_init(); @@ -673,8 +675,6 @@ asmlinkage __visible void __init start_kernel(void) efi_free_boot_services(); } - ftrace_init(); - /* Do the rest non-__init'ed, we're now alive */ rest_init(); } From 9afecfbb95198ec3ea6d52cca4711ea314f29ec6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 24 Mar 2017 17:59:10 -0400 Subject: [PATCH 03/60] tracing: Postpone tracer start-up tests till the system is more robust As tracing can now be enabled very early in boot up, even before some critical system services (like scheduling), do not run the tracer selftests until after early_initcall() is performed. If a tracer is registered before such time, it is saved off in a list and the test is run when the system is able to handle more diverse functions. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 71 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6757561d9617..68a6f78f6862 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1424,6 +1424,28 @@ static int wait_on_pipe(struct trace_iterator *iter, bool full) } #ifdef CONFIG_FTRACE_STARTUP_TEST +static bool selftests_can_run; + +struct trace_selftests { + struct list_head list; + struct tracer *type; +}; + +static LIST_HEAD(postponed_selftests); + +static int save_selftest(struct tracer *type) +{ + struct trace_selftests *selftest; + + selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); + if (!selftest) + return -ENOMEM; + + selftest->type = type; + list_add(&selftest->list, &postponed_selftests); + return 0; +} + static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; @@ -1433,6 +1455,14 @@ static int run_tracer_selftest(struct tracer *type) if (!type->selftest || tracing_selftest_disabled) return 0; + /* + * If a tracer registers early in boot up (before scheduling is + * initialized and such), then do not run its selftests yet. + * Instead, run it a little later in the boot process. + */ + if (!selftests_can_run) + return save_selftest(type); + /* * Run a selftest on this tracer. 
* Here we reset the trace buffer, and set the current @@ -1482,6 +1512,47 @@ static int run_tracer_selftest(struct tracer *type) printk(KERN_CONT "PASSED\n"); return 0; } + +static __init int init_trace_selftests(void) +{ + struct trace_selftests *p, *n; + struct tracer *t, **last; + int ret; + + selftests_can_run = true; + + mutex_lock(&trace_types_lock); + + if (list_empty(&postponed_selftests)) + goto out; + + pr_info("Running postponed tracer tests:\n"); + + list_for_each_entry_safe(p, n, &postponed_selftests, list) { + ret = run_tracer_selftest(p->type); + /* If the test fails, then warn and remove from available_tracers */ + if (ret < 0) { + WARN(1, "tracer: %s failed selftest, disabling\n", + p->type->name); + last = &trace_types; + for (t = trace_types; t; t = t->next) { + if (t == p->type) { + *last = t->next; + break; + } + last = &t->next; + } + } + list_del(&p->list); + kfree(p); + } + + out: + mutex_unlock(&trace_types_lock); + + return 0; +} +early_initcall(init_trace_selftests); #else static inline int run_tracer_selftest(struct tracer *type) { From dbeafd0d6131d0f6ae8cd7551f5f4bf8c54aa49a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 3 Mar 2017 13:48:42 -0500 Subject: [PATCH 04/60] ftrace: Have function tracing start in early boot up Register the function tracer right after the tracing buffers are initialized in early boot up. This will allow function tracing to begin early if it is enabled via the kernel command line. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_functions.c | 3 +-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68a6f78f6862..4fa8e8f3c765 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8036,6 +8036,9 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ae1cce91fead..571acee52a32 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -896,6 +896,7 @@ int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); +int init_function_trace(void); #else static inline int ftrace_trace_task(struct trace_array *tr) { @@ -914,6 +915,7 @@ ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } +static inline int init_function_trace(void) { return 0; } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0efa00d80623..4199ca61b0e5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -687,9 +687,8 @@ static inline int init_func_cmd_traceon(void) } #endif /* CONFIG_DYNAMIC_FTRACE */ -static __init int init_function_trace(void) +__init int init_function_trace(void) { init_func_cmd_traceon(); return register_tracer(&function_trace); } 
-core_initcall(init_function_trace);

From 42c269c88dc146982a54a8267f71abc99f12852a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 3 Mar 2017 16:15:39 -0500
Subject: [PATCH 05/60] ftrace: Allow for function tracing to record init
 functions on boot up

Adding a hook into free_reserved_area() that informs ftrace that boot up
init text is being freed lets ftrace safely remove those init functions
from its records, which keeps ftrace from trying to modify text that no
longer exists.

Note, this still does not allow for tracing .init text of modules, as
modules require different work for freeing their init code.

Link: http://lkml.kernel.org/r/1488502497.7212.24.camel@linux.intel.com

Cc: linux-mm@kvack.org
Cc: Vlastimil Babka
Cc: Mel Gorman
Cc: Peter Zijlstra
Requested-by: Todd Brandt
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/ftrace.h  |  5 +++++
 include/linux/init.h    |  4 +++-
 kernel/trace/ftrace.c   | 44 +++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c         |  4 ++++
 scripts/recordmcount.c  |  1 +
 scripts/recordmcount.pl |  1 +
 6 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 569db5589851..0276a2c487e6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -146,6 +146,10 @@ struct ftrace_ops_hash {
 	struct ftrace_hash *filter_hash;
 	struct mutex regex_lock;
 };
+
+void ftrace_free_mem(void *start, void *end);
+#else
+static inline void ftrace_free_mem(void *start, void *end) { }
 #endif
 
 /*
@@ -262,6 +266,7 @@ static inline int ftrace_nr_registered_ops(void)
 }
 static inline void clear_ftrace_function(void) { }
 static inline void ftrace_kill(void) { }
+static inline void ftrace_free_mem(void *start, void *end) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_STACK_TRACER
diff --git a/include/linux/init.h b/include/linux/init.h
index 79af0962fd52..94769d687cf0 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -39,7 +39,7 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__section(.init.text) __cold notrace __latent_entropy
+#define __init		__section(.init.text) __cold __inittrace __latent_entropy
 #define __initdata	__section(.init.data)
 #define __initconst	__section(.init.rodata)
 #define __exitdata	__section(.exit.data)
@@ -68,8 +68,10 @@
 
 #ifdef MODULE
 #define __exitused
+#define __inittrace notrace
 #else
 #define __exitused  __used
+#define __inittrace
 #endif
 
 #define __exit          __section(.exit.text) __exitused __cold notrace
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b9691ee8f6c1..0556a202c055 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5262,6 +5262,50 @@ void ftrace_module_init(struct module *mod)
 }
 #endif /* CONFIG_MODULES */
 
+void ftrace_free_mem(void *start_ptr, void *end_ptr)
+{
+	unsigned long start = (unsigned long)start_ptr;
+	unsigned long end = (unsigned long)end_ptr;
+	struct ftrace_page **last_pg = &ftrace_pages_start;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	struct dyn_ftrace key;
+	int order;
+
+	key.ip = start;
+	key.flags = end;	/* overload flags, as it is unsigned long */
+
+	mutex_lock(&ftrace_lock);
+
+	for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
+		if (end < pg->records[0].ip ||
+		    start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
+			continue;
+ again:
+		rec = bsearch(&key, pg->records, pg->index,
+			      sizeof(struct dyn_ftrace),
+			      ftrace_cmp_recs);
+		if (!rec)
+			continue;
+		pg->index--;
+		if (!pg->index) {
+			*last_pg = pg->next;
+			order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+			free_pages((unsigned long)pg->records, order);
+			kfree(pg);
+			pg = container_of(last_pg, struct ftrace_page, next);
+			if (!(*last_pg))
+				ftrace_pages = pg;
+			continue;
+		}
+		memmove(rec, rec + 1,
+			(pg->index - (rec - pg->records)) * sizeof(*rec));
+		/* More than one function may be in this block */
+		goto again;
+	}
+	mutex_unlock(&ftrace_lock);
+}
+
 void __init ftrace_init(void)
 {
 	extern unsigned long __start_mcount_loc[];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6cbde310abed..eee82bfb7cd8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -65,6 +65,7 @@
 #include 
 #include 
 #include 
+#include <linux/ftrace.h>
 
 #include 
 #include 
@@ -6605,6 +6606,9 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
 	void *pos;
 	unsigned long pages = 0;
 
+	/* This may be .init text, inform ftrace to remove it */
+	ftrace_free_mem(start, end);
+
 	start = (void *)PAGE_ALIGN((unsigned long)start);
 	end = (void *)((unsigned long)end & PAGE_MASK);
 	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
index aeb34223167c..16e086dcc567 100644
--- a/scripts/recordmcount.c
+++ b/scripts/recordmcount.c
@@ -412,6 +412,7 @@ static int
 is_mcounted_section_name(char const *const txtname)
 {
 	return strcmp(".text",           txtname) == 0 ||
+		strcmp(".init.text",     txtname) == 0 ||
 		strcmp(".ref.text",      txtname) == 0 ||
 		strcmp(".sched.text",    txtname) == 0 ||
 		strcmp(".spinlock.text", txtname) == 0 ||
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 0b6002b36f20..1633c3e6c0b9 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -130,6 +130,7 @@ if ($inputfile =~ m,kernel/trace/ftrace\.o$,) {
 # Acceptable sections to record.
 my %text_sections = (
      ".text" => 1,
+     ".init.text" => 1,
      ".ref.text" => 1,
      ".sched.text" => 1,
      ".spinlock.text" => 1,

From af0009fc16a45d091f896794e97a6457f9a7eddf Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Thu, 16 Mar 2017 11:01:06 -0400
Subject: [PATCH 06/60] tracing: Move trace_handle_return() out of line

Currently trace_handle_return() looks like this:

 static inline enum print_line_t trace_handle_return(struct trace_seq *s)
 {
	return trace_seq_has_overflowed(s) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
 }

Where trace_seq_has_overflowed(s) is:

 static inline bool trace_seq_has_overflowed(struct trace_seq *s)
 {
	return s->full || seq_buf_has_overflowed(&s->seq);
 }

And seq_buf_has_overflowed(&s->seq) is:

 static inline bool seq_buf_has_overflowed(struct seq_buf *s)
 {
	return s->len > s->size;
 }

Making trace_handle_return() into:

	return (s->full || (s->seq.len > s->seq.size)) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;

One would think this is not an issue to keep as an inline. But because
this is used in the TRACE_EVENT() macro, it is extended for every
tracepoint in the system. Taking a look at a single tracepoint,
x86_irq_vector (it was the first one I randomly chose): as
trace_handle_return() is used in the TRACE_EVENT() macro of
trace_raw_output_##call(), we disassemble trace_raw_output_x86_irq_vector
and do a diff:

- is the original
+ is the out-of-line code

I removed identical lines that were different just due to different
addresses.
--- /tmp/irq-vec-orig 2017-03-16 09:12:48.569384851 -0400 +++ /tmp/irq-vec-ool 2017-03-16 09:13:39.378153385 -0400 @@ -6,27 +6,23 @@ 53 push %rbx 48 89 fb mov %rdi,%rbx 4c 8b a7 c0 20 00 00 mov 0x20c0(%rdi),%r12 e8 f7 72 13 00 callq ffffffff81155c80 83 f8 01 cmp $0x1,%eax 74 05 je ffffffff8101e993 5b pop %rbx 41 5c pop %r12 5d pop %rbp c3 retq 41 8b 54 24 08 mov 0x8(%r12),%edx - 48 8d bb 98 10 00 00 lea 0x1098(%rbx),%rdi + 48 81 c3 98 10 00 00 add $0x1098,%rbx - 48 c7 c6 7b 8a a0 81 mov $0xffffffff81a08a7b,%rsi + 48 c7 c6 ab 8a a0 81 mov $0xffffffff81a08aab,%rsi - e8 c5 85 13 00 callq ffffffff81156f70 === here's the start of the main difference === + 48 89 df mov %rbx,%rdi + e8 62 7e 13 00 callq ffffffff81156810 - 8b 93 b8 20 00 00 mov 0x20b8(%rbx),%edx - 31 c0 xor %eax,%eax - 85 d2 test %edx,%edx - 75 11 jne ffffffff8101e9c8 - 48 8b 83 a8 20 00 00 mov 0x20a8(%rbx),%rax - 48 39 83 a0 20 00 00 cmp %rax,0x20a0(%rbx) - 0f 93 c0 setae %al + 48 89 df mov %rbx,%rdi + e8 4a c5 12 00 callq ffffffff8114af00 5b pop %rbx - 0f b6 c0 movzbl %al,%eax === end === 41 5c pop %r12 5d pop %rbp c3 retq If you notice, the original has 22 bytes of text more than the out of line version. As this is for every TRACE_EVENT() defined in the system, this can become quite large. text data bss dec hex filename 8690305 5450490 1298432 15439227 eb957b vmlinux-orig 8681725 5450490 1298432 15430647 eb73f7 vmlinux-handle This change has a total of 8580 bytes in savings. $ objdump -dr /tmp/vmlinux-orig | grep '^[0-9a-f]* Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 11 +---------- kernel/trace/trace.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0af63c4381b9..a556805eff8a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -138,16 +138,7 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; -/* - * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq - * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function - * simplifies those functions and keeps them in sync. - */ -static inline enum print_line_t trace_handle_return(struct trace_seq *s) -{ - return trace_seq_has_overflowed(s) ? - TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; -} +enum print_line_t trace_handle_return(struct trace_seq *s); void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4fa8e8f3c765..b5d4b80f2d45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1998,6 +1998,18 @@ void tracing_record_cmdline(struct task_struct *tsk) __this_cpu_write(trace_cmdline_save, false); } +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? 
+		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
+}
+EXPORT_SYMBOL_GPL(trace_handle_return);
+
 void
 tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 			     int pc)

From 2b87965a1e6b6051435315d3f04627a5dbad979c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Tue, 28 Mar 2017 09:58:21 -0400
Subject: [PATCH 07/60] ftrace/x86: Do not run CPU sync when there is only one
 CPU online

Moving enabling of function tracing to early boot, even before scheduling
is enabled, means that it is not safe to enable interrupts. When function
tracing was enabled at boot up, it used to happen after scheduling and the
other CPUs were brought up. That required running a sync across all CPUs
when modifying the function hook locations in the code. To do the
synchronization, interrupts had to be enabled. Now function tracing can
be started before the other CPUs are brought up, and enabling interrupts
in that case is dangerous. As only the boot CPU is active, there is no
reason to run the synchronization. If the online CPU count is one, do not
bother doing the synchronization. This removes the need to enable
interrupts.

Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: "H. Peter Anvin"
Signed-off-by: Steven Rostedt (VMware)
---
 arch/x86/kernel/ftrace.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8f3d9cf26ff9..70945fbd1258 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -533,7 +533,13 @@ static void do_sync_core(void *data)
 
 static void run_sync(void)
 {
-	int enable_irqs = irqs_disabled();
+	int enable_irqs;
+
+	/* No need to sync if there's only one CPU */
+	if (num_online_cpus() == 1)
+		return;
+
+	enable_irqs = irqs_disabled();
 
 	/* We may be called with interrupts disabled (on bootup). */
 	if (enable_irqs)

From c1bc5919f6741cc4b0c83e3058b3d65d76c943e3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Wed, 29 Mar 2017 11:38:13 -0400
Subject: [PATCH 08/60] ftrace: Clean up __seq_open_private() return check

The return status check of __seq_open_private() is rather strange:

	iter = __seq_open_private();
	if (iter) {
		/* do stuff */
	}

	return iter ? 0 : -ENOMEM;

It makes much more sense to do the return of failure right away:

	iter = __seq_open_private();
	if (!iter)
		return -ENOMEM;

	/* do stuff */

	return 0;

This clean up will make updates to this code a bit nicer.

Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/ftrace.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0556a202c055..527c4d3e8d7f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3355,12 +3355,13 @@ ftrace_avail_open(struct inode *inode, struct file *file)
 		return -ENODEV;
 
 	iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
-	if (iter) {
-		iter->pg = ftrace_pages_start;
-		iter->ops = &global_ops;
-	}
+	if (!iter)
+		return -ENOMEM;
 
-	return iter ? 0 : -ENOMEM;
+	iter->pg = ftrace_pages_start;
+	iter->ops = &global_ops;
+
+	return 0;
 }
 
 static int
@@ -3369,13 +3370,14 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
 	struct ftrace_iterator *iter;
 
 	iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
-	if (iter) {
-		iter->pg = ftrace_pages_start;
-		iter->flags = FTRACE_ITER_ENABLED;
-		iter->ops = &global_ops;
-	}
+	if (!iter)
+		return -ENOMEM;
 
-	return iter ?
0 : -ENOMEM; + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; + + return 0; } /** From c20489dad156dd9919ebd854bbace46dbd2576a3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 29 Mar 2017 14:55:49 -0400 Subject: [PATCH 09/60] ftrace: Assign iter->hash to filter or notrace hashes on seq read Instead of testing if the hash to use is the filter_hash or the notrace_hash at each iteration, do the test at open, and set the iter->hash to point to the corresponding filter or notrace hash. Then use that directly instead of testing which hash needs to be used each iteration. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 527c4d3e8d7f..3165b7f840e6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3157,7 +3157,6 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = iter->ops; struct dyn_ftrace *rec = NULL; if (unlikely(ftrace_disabled)) @@ -3181,11 +3180,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if (((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || - - ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || + if (((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && + !ftrace_lookup_ip(iter->hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && !(rec->flags & FTRACE_FL_ENABLED))) { @@ -3213,7 +3209,6 @@ static void reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = iter->ops; void *p = NULL; loff_t l; @@ -3233,10 +3228,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if ((iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->func_hash->filter_hash)) || - (iter->flags & FTRACE_ITER_NOTRACE && - ftrace_hash_empty(ops->func_hash->notrace_hash))) { + if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && + ftrace_hash_empty(iter->hash)) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -3442,7 +3435,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, ret = -ENOMEM; goto out_unlock; } - } + } else + iter->hash = hash; if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -4526,6 +4520,9 @@ int ftrace_regex_release(struct inode *inode, struct file *file) free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); + } else { + /* For read only, the hash is the ops hash */ + iter->hash = NULL; } mutex_unlock(&iter->ops->func_hash->regex_lock); From 2d71d98900b8a4bd58c3ca92e404d5e3701de874 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 29 Mar 2017 21:40:49 -0400 Subject: [PATCH 10/60] ftrace: Return NULL at end of t_start() instead of calling t_hash_start() The loop in t_start() of calling t_next() will call t_hash_start() if the pos is beyond the functions and enters the hash items. There's no reason to check if p is NULL and call t_hash_start(), as that would be redundant. 
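For reference, a condensed before/after view of the tail of t_start()
(this mirrors the one-line change below; it is not an additional hunk):

	/* before: redundant, t_next() already falls back to the hash items */
	if (!p)
		return t_hash_start(m, pos);
	return iter;

	/* after */
	if (!p)
		return NULL;
	return iter;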
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3165b7f840e6..421530831ddd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3255,7 +3255,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	}
 
 	if (!p)
-		return t_hash_start(m, pos);
+		return NULL;
 
 	return iter;
 }

From 43ff926a0c3a0cfd6aa313c3232420f009ab43e8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Thu, 30 Mar 2017 16:51:43 -0400
Subject: [PATCH 11/60] ftrace: Update func_pos in t_start() when all functions
 are enabled

If all functions are enabled, there's a comment displayed in the file to
denote that:

  # cd /sys/kernel/debug/tracing
  # cat set_ftrace_filter
 #### all functions enabled ####

If a function trigger is set, those are displayed as well:

  # echo schedule:traceoff >> /debug/tracing/set_ftrace_filter
  # cat set_ftrace_filter
 #### all functions enabled ####
 schedule:traceoff:unlimited

But if you read that file with dd, the output can change:

  # dd if=/debug/tracing/set_ftrace_filter bs=1
 #### all functions enabled ####
 32+0 records in
 32+0 records out
 32 bytes copied, 7.0237e-05 s, 456 kB/s

This is because the "pos" variable is updated for the comment, but
func_pos is not. "func_pos" is used by the triggers (or hashes) to know
how many functions were printed, and it bases its index on
pos - func_pos. func_pos should be 1 to account for the comment printed.
But since it is not, t_hash_start() thinks that one trigger was already
printed.

The cat gets to t_hash_start() via t_next() and not t_start(), which
updates both pos and func_pos.

Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/ftrace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 421530831ddd..d4b18ce9ba88 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3230,6 +3230,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	 */
 	if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) &&
 	    ftrace_hash_empty(iter->hash)) {
+		iter->func_pos = 1; /* Account for the message */
 		if (*pos > 0)
 			return t_hash_start(m, pos);
 		iter->flags |= FTRACE_ITER_PRINTALL;

From 5bd84629a7a0e2462c28ca52e213ebe27fadfee8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Wed, 29 Mar 2017 22:45:18 -0400
Subject: [PATCH 12/60] ftrace: Create separate t_func_next() to simplify the
 function / hash logic

I noticed that if I use dd to read the set_ftrace_filter file, the first
hash command is repeated:

  # cd /sys/kernel/debug/tracing
  # echo schedule > set_ftrace_filter
  # echo do_IRQ >> set_ftrace_filter
  # echo schedule:traceoff >> set_ftrace_filter
  # echo do_IRQ:traceoff >> set_ftrace_filter

  # cat set_ftrace_filter
  schedule
  do_IRQ
  schedule:traceoff:unlimited
  do_IRQ:traceoff:unlimited

  # dd if=set_ftrace_filter bs=1
  schedule
  do_IRQ
  schedule:traceoff:unlimited
  schedule:traceoff:unlimited
  do_IRQ:traceoff:unlimited
  98+0 records in
  98+0 records out
  98 bytes copied, 0.00265011 s, 37.0 kB/s

This is due to the way t_start() calls t_next() as well as the seq_file
calls t_next(), and the state is slightly different between the two.
Namely, t_start() will call t_next() with a local "pos" variable.

By separating out the function listing from t_next() into its own
function, we can have better control of outputting the functions and the
hash of triggers. This simplifies the code.
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 45 +++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d4b18ce9ba88..aff7a2c08387 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3154,22 +3154,12 @@ t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) } static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +t_func_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = NULL; - if (unlikely(ftrace_disabled)) - return NULL; - - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, pos); - (*pos)++; - iter->pos = iter->func_pos = *pos; - - if (iter->flags & FTRACE_ITER_PRINTALL) - return t_hash_start(m, pos); retry: if (iter->idx >= iter->pg->index) { @@ -3192,13 +3182,40 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } if (!rec) - return t_hash_start(m, pos); + return NULL; + iter->pos = iter->func_pos = *pos; iter->func = rec; return iter; } +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *ret; + + if (unlikely(ftrace_disabled)) + return NULL; + + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_next(m, pos); + + if (iter->flags & FTRACE_ITER_PRINTALL) { + /* next must increment pos, and t_hash_start does not */ + (*pos)++; + return t_hash_start(m, pos); + } + + ret = t_func_next(m, pos); + + if (!ret) + return t_hash_start(m, pos); + + return ret; +} + static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; @@ -3250,13 +3267,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { - p = t_next(m, p, &l); + p = t_func_next(m, &l); if (!p) break; } if (!p) - return NULL; + return t_hash_start(m, pos); return iter; } From b80f0f6c9ed3958ff4002b6135f43a1ef312a610 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Apr 2017 12:57:35 -0400 Subject: [PATCH 13/60] ftrace: Have init/main.c call ftrace directly to free init memory Relying on free_reserved_area() to call ftrace to free init memory proved to not be sufficient. The issue is that on x86, when debug_pagealloc is enabled, the init memory is not freed, but simply set as not present. Since ftrace was uninformed of this, starting function tracing still tries to update pages that are not present according to the page tables, causing ftrace to bug, as well as killing the kernel itself. Instead of relying on free_reserved_area(), have init/main.c call ftrace directly just before it frees the init memory. Then it needs to use __init_begin and __init_end to know where the init memory location is. Looking at all archs (and testing what I can), it appears that this should work for each of them. 
Reported-by: kernel test robot Reported-by: Fengguang Wu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++--- init/main.c | 1 + kernel/trace/ftrace.c | 7 ++++--- mm/page_alloc.c | 3 --- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0276a2c487e6..ef7123219f14 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -147,9 +147,9 @@ struct ftrace_ops_hash { struct mutex regex_lock; }; -void ftrace_free_mem(void *start, void *end); +void ftrace_free_init_mem(void); #else -static inline void ftrace_free_mem(void *start, void *end) { } +static inline void ftrace_free_init_mem(void) { } #endif /* @@ -266,7 +266,7 @@ static inline int ftrace_nr_registered_ops(void) } static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } -static inline void ftrace_free_mem(void *start, void *end) { } +static inline void ftrace_free_init_mem(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_STACK_TRACER diff --git a/init/main.c b/init/main.c index c0137b916aa1..0e8849f74561 100644 --- a/init/main.c +++ b/init/main.c @@ -962,6 +962,7 @@ static int __ref kernel_init(void *unused) kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + ftrace_free_init_mem(); free_initmem(); mark_readonly(); system_state = SYSTEM_RUNNING; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aff7a2c08387..8efd9fe7aec0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -36,6 +36,7 @@ #include +#include #include #include "trace_output.h" @@ -5279,10 +5280,10 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ -void ftrace_free_mem(void *start_ptr, void *end_ptr) +void __init ftrace_free_init_mem(void) { - unsigned long start = (unsigned long)start_ptr; - unsigned long end = (unsigned long)end_ptr; + unsigned long start = (unsigned long)(&__init_begin); + unsigned long end = (unsigned long)(&__init_end); struct ftrace_page **last_pg = &ftrace_pages_start; struct ftrace_page *pg; struct dyn_ftrace *rec; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee82bfb7cd8..178bf9c2a2cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6606,9 +6606,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) void *pos; unsigned long pages = 0; - /* This may be .init text, inform ftrace to remove it */ - ftrace_free_mem(start, end); - start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { From 696ced4fb1d76802f864d8848aa4716633f83c17 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Mon, 3 Apr 2017 12:36:22 +0200 Subject: [PATCH 14/60] tracing/kprobes: expose maxactive for kretprobe in kprobe_events When a kretprobe is installed on a kernel function, there is a maximum limit of how many calls in parallel it can catch (aka "maxactive"). A kernel module could call register_kretprobe() and initialize maxactive (see example in samples/kprobes/kretprobe_example.c). But that is not exposed to userspace and it is currently not possible to choose maxactive when writing to /sys/kernel/debug/tracing/kprobe_events The default maxactive can be as low as 1 on single-core with a non-preemptive kernel. This is too low and we need to increase it not only for recursive functions, but for functions that sleep or resched. 
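As a usage sketch (the event name here is illustrative, not part of the
patch), the extended syntax added below allows something like:

  # echo 'r100:myaccept inet_csk_accept' > /sys/kernel/debug/tracing/kprobe_events

which installs a kretprobe on inet_csk_accept with maxactive raised to
100 instead of the default.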
This patch updates the format of the command that can be written to kprobe_events so that maxactive can be optionally specified. I need this for a bpf program attached to the kretprobe of inet_csk_accept, which can sleep for a long time. This patch includes a basic selftest: > # ./ftracetest -v test.d/kprobe/ > === Ftrace unit tests === > [1] Kprobe dynamic event - adding and removing [PASS] > [2] Kprobe dynamic event - busy event check [PASS] > [3] Kprobe dynamic event with arguments [PASS] > [4] Kprobes event arguments with types [PASS] > [5] Kprobe dynamic event with function tracer [PASS] > [6] Kretprobe dynamic event with arguments [PASS] > [7] Kretprobe dynamic event with maxactive [PASS] > > # of passed: 7 > # of failed: 0 > # of unresolved: 0 > # of untested: 0 > # of unsupported: 0 > # of xfailed: 0 > # of undefined(test bug): 0 BugLink: https://github.com/iovisor/bcc/issues/1072 Link: http://lkml.kernel.org/r/1491215782-15490-1-git-send-email-alban@kinvolk.io Acked-by: Masami Hiramatsu Signed-off-by: Alban Crequy Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/kprobetrace.txt | 5 ++- kernel/trace/trace_kprobe.c | 39 ++++++++++++++++--- .../test.d/kprobe/kretprobe_maxactive.tc | 39 +++++++++++++++++++ 3 files changed, 76 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 41ef9d8efe95..25f39601a7f7 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -23,7 +23,7 @@ current_tracer. Instead of that, add probe points via Synopsis of kprobe_events ------------------------- p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe + r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe -:[GRP/]EVENT : Clear a probe GRP : Group name. If omitted, use "kprobes" for it. @@ -32,6 +32,9 @@ Synopsis of kprobe_events MOD : Module name which has given SYM. SYM[+offs] : Symbol+offset where the probe is inserted. MEMADDR : Address where the probe is inserted. + MAXACTIVE : Maximum number of instances of the specified function that + can be probed simultaneously, or 0 for the default value + as defined in Documentation/kprobes.txt section 1.3.1. FETCHARGS : Arguments. Each probe can have up to 128 args. 
%REG : Fetch register REG diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5f688cc724f0..f01d49b576c0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -25,6 +25,7 @@ #include "trace_probe.h" #define KPROBE_EVENT_SYSTEM "kprobes" +#define KRETPROBE_MAXACTIVE_MAX 4096 /** * Kprobe event core functions @@ -282,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, void *addr, const char *symbol, unsigned long offs, + int maxactive, int nargs, bool is_return) { struct trace_kprobe *tk; @@ -309,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, else tk->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.maxactive = maxactive; + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; @@ -598,8 +602,10 @@ static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: - * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] - * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * - Add kprobe: + * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: + * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -619,6 +625,7 @@ static int create_trace_kprobe(int argc, char **argv) int i, ret = 0; bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; + int maxactive = 0; char *arg; unsigned long offset = 0; void *addr = NULL; @@ -637,8 +644,28 @@ static int create_trace_kprobe(int argc, char **argv) return -EINVAL; } - if (argv[0][1] == ':') { - event = &argv[0][2]; + event = strchr(&argv[0][1], ':'); + if (event) { + event[0] = '\0'; + event++; + } + if (is_return && isdigit(argv[0][1])) { + ret = kstrtouint(&argv[0][1], 0, &maxactive); + if (ret) { + pr_info("Failed to parse maxactive.\n"); + return ret; + } + /* kretprobes instances are iterated over via a list. The + * maximum should stay reasonable. + */ + if (maxactive > KRETPROBE_MAXACTIVE_MAX) { + pr_info("Maxactive is too big (%d > %d).\n", + maxactive, KRETPROBE_MAXACTIVE_MAX); + return -E2BIG; + } + } + + if (event) { if (strchr(event, '/')) { group = event; event = strchr(group, '/') + 1; @@ -718,8 +745,8 @@ static int create_trace_kprobe(int argc, char **argv) is_return ? 
			 'r' : 'p', addr);
 		event = buf;
 	}
-	tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
-				is_return);
+	tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
+				argc, is_return);
 	if (IS_ERR(tk)) {
 		pr_info("Failed to allocate trace_probe.(%d)\n",
 			(int)PTR_ERR(tk));
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc
new file mode 100644
index 000000000000..57abdf1caabf
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc
@@ -0,0 +1,39 @@
+#!/bin/sh
+# description: Kretprobe dynamic event with maxactive
+
+[ -f kprobe_events ] || exit_unsupported # this is configurable
+
+echo > kprobe_events
+
+# Test if we successfully reject unknown messages
+if echo 'a:myprobeaccept inet_csk_accept' > kprobe_events; then false; else true; fi
+
+# Test if we successfully reject too big maxactive
+if echo 'r1000000:myprobeaccept inet_csk_accept' > kprobe_events; then false; else true; fi
+
+# Test if we successfully reject unparsable numbers for maxactive
+if echo 'r10fuzz:myprobeaccept inet_csk_accept' > kprobe_events; then false; else true; fi
+
+# Test for kretprobe with event name without maxactive
+echo 'r:myprobeaccept inet_csk_accept' > kprobe_events
+grep myprobeaccept kprobe_events
+test -d events/kprobes/myprobeaccept
+echo '-:myprobeaccept' >> kprobe_events
+
+# Test for kretprobe with event name with a small maxactive
+echo 'r10:myprobeaccept inet_csk_accept' > kprobe_events
+grep myprobeaccept kprobe_events
+test -d events/kprobes/myprobeaccept
+echo '-:myprobeaccept' >> kprobe_events
+
+# Test for kretprobe without event name without maxactive
+echo 'r inet_csk_accept' > kprobe_events
+grep inet_csk_accept kprobe_events
+echo > kprobe_events
+
+# Test for kretprobe without event name with a small maxactive
+echo 'r10 inet_csk_accept' > kprobe_events
+grep inet_csk_accept kprobe_events
+echo > kprobe_events
+
+clear_trace

From 0598e4f08e3da1fea2ee3b4765a44798147a8c62 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Thu, 6 Apr 2017 10:28:12 -0400
Subject: [PATCH 15/60] ftrace: Add use of synchronize_rcu_tasks() with dynamic
 trampolines

The function tracer needs to be more careful than other subsystems when
it comes to freeing data. Especially if that data is actually executable
code.

When a single function is traced, a trampoline can be dynamically
allocated which is called to jump to the function trace callback. When
the callback is no longer needed, the dynamically allocated trampoline
needs to be freed. This is where the issues arise. The dynamically
allocated trampoline must not be used again. As function tracing can
trace all subsystems, including subsystems that are used to serialize
aspects of freeing (namely RCU), it must take extra care when doing the
freeing.

Before synchronize_rcu_tasks() was around, there was no way for the
function tracer to know that nothing was using the dynamically allocated
trampoline when CONFIG_PREEMPT was enabled. That's because a task could
be indefinitely preempted while sitting on the trampoline. Now with
synchronize_rcu_tasks(), it will wait till all tasks have either
voluntarily scheduled (not on the trampoline) or gone into userspace
(not on the trampoline). Then it is safe to free the trampoline even
with CONFIG_PREEMPT set.

Acked-by: "Paul E. McKenney"
McKenney" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 3 ++- kernel/trace/ftrace.c | 42 ++++++++++++++++++------------------------ 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d4a06e714645..67b463b4f169 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -134,7 +134,8 @@ config FUNCTION_TRACER select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB + select TASKS_RCU if PREEMPT help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8efd9fe7aec0..34f63e78d661 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2808,18 +2808,28 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * callers are done before leaving this function. * The same goes for freeing the per_cpu data of the per_cpu * ops. - * - * Again, normal synchronize_sched() is not good enough. - * We need to do a hard force of sched synchronization. - * This is because we use preempt_disable() to do RCU, but - * the function tracers can be called where RCU is not watching - * (like before user_exit()). We can not rely on the RCU - * infrastructure to do the synchronization, thus we must do it - * ourselves. */ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { + /* + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ schedule_on_each_cpu(ftrace_sync); + /* + * When the kernel is preeptive, tasks can be preempted + * while on a ftrace trampoline. Just scheduling a task on + * a CPU is not good enough to flush them. Calling + * synchornize_rcu_tasks() will wait for those tasks to + * execute and either schedule voluntarily or enter user space. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_tasks(); + arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_PER_CPU) @@ -5366,22 +5376,6 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { - -/* - * Currently there's no safe way to free a trampoline when the kernel - * is configured with PREEMPT. That is because a task could be preempted - * when it jumped to the trampoline, it may be preempted for a long time - * depending on the system load, and currently there's no way to know - * when it will be off the trampoline. If the trampoline is freed - * too early, when the task runs again, it will be executing on freed - * memory and crash. - */ -#ifdef CONFIG_PREEMPT - /* Currently, only non dynamic ops can have a trampoline */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - return; -#endif - arch_ftrace_update_trampoline(ops); } From 252babcd52aabe37aaad03685e7d6ad454edb9f9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 6 Apr 2017 12:11:36 -0400 Subject: [PATCH 16/60] tracing: Replace the per_cpu() with __this_cpu*() in trace_stack.c The updates to the trace_active per cpu variable can be updated with the __this_cpu_*() functions as it only gets updated on the CPU that the variable is on. Thanks to Paul McKenney for suggesting __this_cpu_* instead of this_cpu_*. Acked-by: Paul E. 
McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5fb1f2c87e6b..338d076a06da 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -207,13 +207,12 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { unsigned long stack; - int cpu; preempt_disable_notrace(); - cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ - if (per_cpu(trace_active, cpu)++ != 0) + __this_cpu_inc(trace_active); + if (__this_cpu_read(trace_active) != 1) goto out; ip += MCOUNT_INSN_SIZE; @@ -221,7 +220,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, check_stack(ip, &stack); out: - per_cpu(trace_active, cpu)--; + __this_cpu_dec(trace_active); /* prevent recursion in schedule */ preempt_enable_notrace(); } @@ -253,7 +252,6 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, long *ptr = filp->private_data; unsigned long val, flags; int ret; - int cpu; ret = kstrtoul_from_user(ubuf, count, 10, &val); if (ret) @@ -266,14 +264,13 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, * we will cause circular lock, so we also need to increase * the percpu trace_active here. */ - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; + __this_cpu_inc(trace_active); arch_spin_lock(&stack_trace_max_lock); *ptr = val; arch_spin_unlock(&stack_trace_max_lock); - per_cpu(trace_active, cpu)--; + __this_cpu_dec(trace_active); local_irq_restore(flags); return count; @@ -307,12 +304,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - int cpu; - local_irq_disable(); - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; + __this_cpu_inc(trace_active); arch_spin_lock(&stack_trace_max_lock); @@ -324,12 +318,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { - int cpu; - arch_spin_unlock(&stack_trace_max_lock); - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)--; + __this_cpu_dec(trace_active); local_irq_enable(); } From 5367278cb7ba74537bcad1470d75f30d95b09c14 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 6 Apr 2017 12:26:20 -0400 Subject: [PATCH 17/60] tracing: Add stack_tracer_disable/enable() functions There are certain parts of the kernel that cannot let stack tracing proceed (namely in RCU), because the stack tracer uses RCU, and parts of RCU internals cannot handle having RCU read side locks taken. Add stack_tracer_disable() and stack_tracer_enable() functions to let RCU stop stack tracing on the current CPU when it is in those critical sections. 
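A minimal sketch of the intended calling pattern (the surrounding
context here is illustrative; the real RCU call site is added in a later
patch in this series):

	preempt_disable_notrace();	/* or interrupts already disabled */
	stack_tracer_disable();
	/* ... section where stack tracing must not run ... */
	stack_tracer_enable();
	preempt_enable_notrace();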
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++++ kernel/trace/trace_stack.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ef7123219f14..7b4e6572ab21 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -286,6 +286,12 @@ int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +void stack_tracer_disable(void); +void stack_tracer_enable(void); +#else +static inline void stack_tracer_disable(void) { } +static inline void stack_tracer_enable(void) { } #endif struct ftrace_func_command { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 338d076a06da..21e536cf66e4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -41,6 +41,38 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; +/** + * stack_tracer_disable - temporarily disable the stack tracer + * + * There's a few locations (namely in RCU) where stack tracing + * cannot be executed. This function is used to disable stack + * tracing during those critical sections. + * + * This function must be called with preemption or interrupts + * disabled and stack_tracer_enable() must be called shortly after + * while preemption or interrupts are still disabled. + */ +void stack_tracer_disable(void) +{ + /* Preemption or interupts must be disabled */ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_inc(trace_active); +} + +/** + * stack_tracer_enable - re-enable the stack tracer + * + * After stack_tracer_disable() is called, stack_tracer_enable() + * must be called shortly afterward. + */ +void stack_tracer_enable(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_dec(trace_active); +} + void stack_trace_print(void) { long i; From 8aaf1ee70e19ac74cbbb81098edfa328d1ab4bd7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 6 Apr 2017 15:47:32 -0400 Subject: [PATCH 18/60] tracing: Rename trace_active to disable_stack_tracer and inline its modification In order to eliminate a function call, make "trace_active" into "disable_stack_tracer" and convert stack_tracer_disable() and friends into static inline functions. Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 36 +++++++++++++++++++++++++-- kernel/trace/trace_stack.c | 50 +++++++------------------------------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7b4e6572ab21..06b2990a35e4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -287,8 +287,40 @@ stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -void stack_tracer_disable(void); -void stack_tracer_enable(void); +/* DO NOT MODIFY THIS VARIABLE DIRECTLY! */ +DECLARE_PER_CPU(int, disable_stack_tracer); + +/** + * stack_tracer_disable - temporarily disable the stack tracer + * + * There's a few locations (namely in RCU) where stack tracing + * cannot be executed. This function is used to disable stack + * tracing during those critical sections. + * + * This function must be called with preemption or interrupts + * disabled and stack_tracer_enable() must be called shortly after + * while preemption or interrupts are still disabled. 
+ */ +static inline void stack_tracer_disable(void) +{ + /* Preemption or interupts must be disabled */ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_inc(disable_stack_tracer); +} + +/** + * stack_tracer_enable - re-enable the stack tracer + * + * After stack_tracer_disable() is called, stack_tracer_enable() + * must be called shortly afterward. + */ +static inline void stack_tracer_enable(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_dec(disable_stack_tracer); +} #else static inline void stack_tracer_disable(void) { } static inline void stack_tracer_enable(void) { } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 21e536cf66e4..f2f02ff350d4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -35,44 +35,12 @@ unsigned long stack_trace_max_size; arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static DEFINE_PER_CPU(int, trace_active); +DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -/** - * stack_tracer_disable - temporarily disable the stack tracer - * - * There's a few locations (namely in RCU) where stack tracing - * cannot be executed. This function is used to disable stack - * tracing during those critical sections. - * - * This function must be called with preemption or interrupts - * disabled and stack_tracer_enable() must be called shortly after - * while preemption or interrupts are still disabled. - */ -void stack_tracer_disable(void) -{ - /* Preemption or interupts must be disabled */ - if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) - WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); - this_cpu_inc(trace_active); -} - -/** - * stack_tracer_enable - re-enable the stack tracer - * - * After stack_tracer_disable() is called, stack_tracer_enable() - * must be called shortly afterward. - */ -void stack_tracer_enable(void) -{ - if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) - WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); - this_cpu_dec(trace_active); -} - void stack_trace_print(void) { long i; @@ -243,8 +211,8 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); /* no atomic needed, we only modify this variable by this cpu */ - __this_cpu_inc(trace_active); - if (__this_cpu_read(trace_active) != 1) + __this_cpu_inc(disable_stack_tracer); + if (__this_cpu_read(disable_stack_tracer) != 1) goto out; ip += MCOUNT_INSN_SIZE; @@ -252,7 +220,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, check_stack(ip, &stack); out: - __this_cpu_dec(trace_active); + __this_cpu_dec(disable_stack_tracer); /* prevent recursion in schedule */ preempt_enable_notrace(); } @@ -294,15 +262,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, /* * In case we trace inside arch_spin_lock() or after (NMI), * we will cause circular lock, so we also need to increase - * the percpu trace_active here. + * the percpu disable_stack_tracer here. 
 	 */
-	__this_cpu_inc(trace_active);
+	__this_cpu_inc(disable_stack_tracer);
 
 	arch_spin_lock(&stack_trace_max_lock);
 
 	*ptr = val;
 
 	arch_spin_unlock(&stack_trace_max_lock);
 
-	__this_cpu_dec(trace_active);
+	__this_cpu_dec(disable_stack_tracer);
 	local_irq_restore(flags);
 
 	return count;
@@ -338,7 +306,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 {
 	local_irq_disable();
 
-	__this_cpu_inc(trace_active);
+	__this_cpu_inc(disable_stack_tracer);
 
 	arch_spin_lock(&stack_trace_max_lock);
 
@@ -352,7 +320,7 @@ static void t_stop(struct seq_file *m, void *p)
 {
 	arch_spin_unlock(&stack_trace_max_lock);
 
-	__this_cpu_dec(trace_active);
+	__this_cpu_dec(disable_stack_tracer);
 	local_irq_enable();
 }

From a278d4718988a253a08d12e6d76c80979693d39e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 5 Apr 2017 09:05:18 -0700
Subject: [PATCH 19/60] rcu: Fix dyntick-idle tracing

The tracing subsystem started using rcu_irq_enter() and rcu_irq_exit()
(with my blessing) to allow the current _rcuidle alternative tracepoint
name to be dispensed with while still maintaining good performance.
Unfortunately, this causes RCU's dyntick-idle entry code's tracing to
appear to RCU like an interrupt that occurs where RCU is not designed
to handle interrupts.

This commit fixes this problem by moving the zeroing of
->dynticks_nesting after the offending trace_rcu_dyntick() statement,
which narrows the window of vulnerability to a pair of adjacent
statements that are now marked with comments to that effect.

Link: http://lkml.kernel.org/r/20170405093207.404f8deb@gandalf.local.home
Link: http://lkml.kernel.org/r/20170405193928.GM1600@linux.vnet.ibm.com

Reported-by: Steven Rostedt
Signed-off-by: Paul E. McKenney
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/rcu/tree.c | 48 +++++++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 50fee7689e71..8b4d273331e4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,7 @@
 #include 
 #include 
 #include 
+#include <linux/ftrace.h>
 
 #include "tree.h"
 #include "rcu.h"
@@ -771,25 +772,24 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 }
 
 /*
- * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
+ * rcu_eqs_enter_common - current CPU is entering an extended quiescent state
  *
- * If the new value of the ->dynticks_nesting counter now is zero,
- * we really have entered idle, and must do the appropriate accounting.
- * The caller must have disabled interrupts.
+ * Enter idle, doing appropriate accounting. The caller must have
+ * disabled interrupts.
 */
-static void rcu_eqs_enter_common(long long oldval, bool user)
+static void rcu_eqs_enter_common(bool user)
 {
 	struct rcu_state *rsp;
 	struct rcu_data *rdp;
-	RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
+	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-	trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
+	trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
 	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
 	    !user && !is_idle_task(current)) {
 		struct task_struct *idle __maybe_unused =
 			idle_task(smp_processor_id());
-		trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
+		trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0);
 		rcu_ftrace_dump(DUMP_ORIG);
 		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
 			  current->pid, current->comm,
@@ -800,7 +800,10 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
 		do_nocb_deferred_wakeup(rdp);
 	}
 	rcu_prepare_for_idle();
-	rcu_dynticks_eqs_enter();
+	stack_tracer_disable();
+	rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */
+	rcu_dynticks_eqs_enter(); /* After this, tracing works again. */
+	stack_tracer_enable();
 	rcu_dynticks_task_enter();
 	/*
@@ -821,19 +824,15 @@
 */
 static void rcu_eqs_enter(bool user)
 {
-	long long oldval;
 	struct rcu_dynticks *rdtp;
 	rdtp = this_cpu_ptr(&rcu_dynticks);
-	oldval = rdtp->dynticks_nesting;
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-		     (oldval & DYNTICK_TASK_NEST_MASK) == 0);
-	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
-		rdtp->dynticks_nesting = 0;
-		rcu_eqs_enter_common(oldval, user);
-	} else {
+		     (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
+	if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
+		rcu_eqs_enter_common(user);
+	else
 		rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
-	}
 }
 /**
@@ -892,19 +891,18 @@ void rcu_user_enter(void)
 */
 void rcu_irq_exit(void)
 {
-	long long oldval;
 	struct rcu_dynticks *rdtp;
 	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
 	rdtp = this_cpu_ptr(&rcu_dynticks);
-	oldval = rdtp->dynticks_nesting;
-	rdtp->dynticks_nesting--;
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-		     rdtp->dynticks_nesting < 0);
-	if (rdtp->dynticks_nesting)
-		trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
-	else
-		rcu_eqs_enter_common(oldval, true);
+		     rdtp->dynticks_nesting < 1);
+	if (rdtp->dynticks_nesting <= 1) {
+		rcu_eqs_enter_common(true);
+	} else {
+		trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
+		rdtp->dynticks_nesting--;
+	}
 	rcu_sysidle_enter(1);
 }

From 03ecd3f48e57f2e6154584e0ee7450d7a05e2d3b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 7 Apr 2017 12:20:36 -0400
Subject: [PATCH 20/60] rcu/tracing: Add rcu_disabled to denote when rcu_irq_enter() will not work

Tracing uses rcu_irq_enter() as a way to make sure that RCU is watching when it needs to use rcu_read_lock() and friends. This is because tracing can happen as RCU is about to enter user space, or about to go idle, and RCU does not watch for RCU read side critical sections as it makes the transition.

There is a small location within the RCU infrastructure in which rcu_irq_enter() itself will not work. If tracing occurs in that section, it will break if it tries to use rcu_irq_enter().
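As a minimal sketch (illustration only, not part of the diffs below), any tracing-side code that needs RCU to be watching can check the predicate this patch introduces before asking for it:

	/* hypothetical caller; rcu_irq_enter_disabled() is added by this patch */
	if (unlikely(rcu_irq_enter_disabled()))
		return;		/* RCU cannot be made to watch here; drop the event */
	rcu_irq_enter_irqson();
	/* ... work that relies on rcu_read_lock() and friends ... */
	rcu_irq_exit_irqson();

This is the pattern the stack tracer adopts in the trace_stack.c hunk below, and that trace events adopt in the following patch.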
Originally, this happened with the stack tracer, because it calls save_stack_trace() whenever it encounters stack usage greater than any it had seen before. There was a case where that happened in the RCU section where rcu_irq_enter() did not work, and lockdep complained loudly about it. To fix it, stack tracing added a way to be disabled, and RCU disabled stack tracing during the critical section in which rcu_irq_enter() was inoperable. This solution worked, but there are other cases that use rcu_irq_enter(), and it would be better for RCU itself to provide a way to let others know that rcu_irq_enter() will not work. For example, in trace events.

Another helpful aspect of this change is that it also moves the per-CPU variable checked in the RCU critical section into cache locality with the other RCU per-CPU variables used in that same location.

I'm keeping the stack_tracer_disable() code, as it could still be used in the future by places that really need to disable it. And since it's only a static inline, it won't take up any kernel text if it is not used.

Link: http://lkml.kernel.org/r/20170405093207.404f8deb@gandalf.local.home

Acked-by: Paul E. McKenney
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/rcupdate.h | 5 +++++
 kernel/rcu/tree.c | 18 ++++++++++++++++--
 kernel/trace/trace_stack.c | 8 ++++++++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index de88b33c0974..dea8f17b2fe3 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -97,6 +97,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
 			       unsigned long secs,
 			       unsigned long c_old,
 			       unsigned long c);
+bool rcu_irq_enter_disabled(void);
 #else
 static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
 					  int *flags,
@@ -113,6 +114,10 @@ static inline void rcutorture_record_test_transition(void)
 static inline void rcutorture_record_progress(unsigned long vernum)
 {
 }
+static inline bool rcu_irq_enter_disabled(void)
+{
+	return false;
+}
 #ifdef CONFIG_RCU_TRACE
 void do_trace_rcu_torture_read(const char *rcutorturename,
 			       struct rcu_head *rhp,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8b4d273331e4..a6dcf3bd244f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -284,6 +284,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 };
+/*
+ * There are a few places, currently just in the tracing infrastructure,
+ * that use rcu_irq_enter() to make sure RCU is watching. But there's
+ * a small location where that will not even work. In those cases
+ * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter()
+ * can be called.
+ */
+static DEFINE_PER_CPU(bool, disable_rcu_irq_enter);
+
+bool rcu_irq_enter_disabled(void)
+{
+	return this_cpu_read(disable_rcu_irq_enter);
+}
+
 /*
 * Record entry into an extended quiescent state. This is only to be
 * called when not already in an extended quiescent state.
@@ -800,10 +814,10 @@ static void rcu_eqs_enter_common(bool user)
 		do_nocb_deferred_wakeup(rdp);
 	}
 	rcu_prepare_for_idle();
-	stack_tracer_disable();
+	__this_cpu_inc(disable_rcu_irq_enter);
 	rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */
 	rcu_dynticks_eqs_enter(); /* After this, tracing works again.
 */
-	stack_tracer_enable();
+	__this_cpu_dec(disable_rcu_irq_enter);
 	rcu_dynticks_task_enter();
 	/*
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f2f02ff350d4..76aa04d4c925 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -96,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack)
 	if (in_nmi())
 		return;
+	/*
+	 * There's a slight chance that we are tracing inside the
+	 * RCU infrastructure, and rcu_irq_enter() will not work
+	 * as expected.
+	 */
+	if (unlikely(rcu_irq_enter_disabled()))
+		return;
+
 	local_irq_save(flags);
 	arch_spin_lock(&stack_trace_max_lock);

From d54b6eeb553c89ed8d4c5a2ed73df374a45b9562 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 7 Apr 2017 12:40:49 -0400
Subject: [PATCH 21/60] tracing: Make sure rcu_irq_enter() can work for trace_*_rcuidle() trace events

Stack tracing discovered that there's a small location inside the RCU infrastructure where calling rcu_irq_enter() does not work. As trace events use rcu_irq_enter(), they must make sure that it is functional. A check against rcu_irq_enter_disabled() is added with a WARN_ON_ONCE(), as no trace event should ever be used in that part of RCU. If the warning is triggered, then the trace event is ignored.

Restructure __DO_TRACE() a bit to get rid of the prercu and postrcu arguments, and just have an rcucheck argument that does the work from within the __DO_TRACE() macro. gcc optimization will compile out the rcucheck=0 case.

Link: http://lkml.kernel.org/r/20170405093207.404f8deb@gandalf.local.home

Acked-by: Mathieu Desnoyers
Acked-by: Paul E. McKenney
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/tracepoint.h | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index f72fcfe0e66a..cc48cb2ce209 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -128,7 +128,7 @@ extern void syscall_unregfunc(void);
 * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just
 * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto".
 */
-#define __DO_TRACE(tp, proto, args, cond, prercu, postrcu) \
+#define __DO_TRACE(tp, proto, args, cond, rcucheck) \
 	do { \
 		struct tracepoint_func *it_func_ptr; \
 		void *it_func; \
 		\
 		if (!(cond)) \
 			return; \
-		prercu; \
+		if (rcucheck) { \
+			if (WARN_ON_ONCE(rcu_irq_enter_disabled())) \
+				return; \
+			rcu_irq_enter_irqson(); \
+		} \
 		rcu_read_lock_sched_notrace(); \
 		it_func_ptr = rcu_dereference_sched((tp)->funcs); \
 		if (it_func_ptr) { \
 			do { \
 			} while ((++it_func_ptr)->func); \
 		} \
 		rcu_read_unlock_sched_notrace(); \
-		postrcu; \
+		if (rcucheck) \
+			rcu_irq_exit_irqson(); \
 	} while (0)

 #ifndef MODULE
-#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \
+#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \
 	static inline void trace_##name##_rcuidle(proto) \
 	{ \
 		if (static_key_false(&__tracepoint_##name.key)) \
 			__DO_TRACE(&__tracepoint_##name, \
 				TP_PROTO(data_proto), \
 				TP_ARGS(data_args), \
-				TP_CONDITION(cond), \
-				rcu_irq_enter_irqson(), \
-				rcu_irq_exit_irqson()); \
+				TP_CONDITION(cond), 1); \
 	}
 #else
 #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args)
@@ -186,7 +189,7 @@ extern void syscall_unregfunc(void);
 			__DO_TRACE(&__tracepoint_##name, \
 				TP_PROTO(data_proto), \
 				TP_ARGS(data_args), \
-				TP_CONDITION(cond),,); \
+				TP_CONDITION(cond), 0); \
 		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
 			rcu_read_lock_sched_notrace(); \
 			rcu_dereference_sched(__tracepoint_##name.funcs);\

From acceb72e90624ec6c3f2fc62e72dab892cd075da Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 14 Apr 2017 17:45:45 -0400
Subject: [PATCH 22/60] ftrace: Fix removing of second function probe

When two function probes are added to set_ftrace_filter, and then one of them is removed, the update to the function locations is not performed, the record keeping of the function states is corrupted, and an ftrace_bug() occurs. This is easily reproducible by adding two probes, removing one, and then adding it back again.

 # cd /sys/kernel/debug/tracing
 # echo schedule:traceoff > set_ftrace_filter
 # echo do_IRQ:traceoff > set_ftrace_filter
 # echo \!do_IRQ:traceoff > /debug/tracing/set_ftrace_filter
 # echo do_IRQ:traceoff > set_ftrace_filter

Causes:

 ------------[ cut here ]------------
 WARNING: CPU: 2 PID: 1098 at kernel/trace/ftrace.c:2369 ftrace_get_addr_curr+0x143/0x220
 Modules linked in: [...]
 CPU: 2 PID: 1098 Comm: bash Not tainted 4.10.0-test+ #405
 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v02.05 05/07/2012
 Call Trace:
 dump_stack+0x68/0x9f
 __warn+0x111/0x130
 ? trace_irq_work_interrupt+0xa0/0xa0
 warn_slowpath_null+0x1d/0x20
 ftrace_get_addr_curr+0x143/0x220
 ? __fentry__+0x10/0x10
 ftrace_replace_code+0xe3/0x4f0
 ? ftrace_int3_handler+0x90/0x90
 ? printk+0x99/0xb5
 ? 0xffffffff81000000
 ftrace_modify_all_code+0x97/0x110
 arch_ftrace_update_code+0x10/0x20
 ftrace_run_update_code+0x1c/0x60
 ftrace_run_modify_code.isra.48.constprop.62+0x8e/0xd0
 register_ftrace_function_probe+0x4b6/0x590
 ? ftrace_startup+0x310/0x310
 ? debug_lockdep_rcu_enabled.part.4+0x1a/0x30
 ? update_stack_state+0x88/0x110
 ? ftrace_regex_write.isra.43.part.44+0x1d3/0x320
 ? preempt_count_sub+0x18/0xd0
 ? mutex_lock_nested+0x104/0x800
 ? ftrace_regex_write.isra.43.part.44+0x1d3/0x320
 ? __unwind_start+0x1c0/0x1c0
 ? _mutex_lock_nest_lock+0x800/0x800
 ftrace_trace_probe_callback.isra.3+0xc0/0x130
 ? func_set_flag+0xe0/0xe0
 ?
__lock_acquire+0x642/0x1790 ? __might_fault+0x1e/0x20 ? trace_get_user+0x398/0x470 ? strcmp+0x35/0x60 ftrace_trace_onoff_callback+0x48/0x70 ftrace_regex_write.isra.43.part.44+0x251/0x320 ? match_records+0x420/0x420 ftrace_filter_write+0x2b/0x30 __vfs_write+0xd7/0x330 ? do_loop_readv_writev+0x120/0x120 ? locks_remove_posix+0x90/0x2f0 ? do_lock_file_wait+0x160/0x160 ? __lock_is_held+0x93/0x100 ? rcu_read_lock_sched_held+0x5c/0xb0 ? preempt_count_sub+0x18/0xd0 ? __sb_start_write+0x10a/0x230 ? vfs_write+0x222/0x240 vfs_write+0xef/0x240 SyS_write+0xab/0x130 ? SyS_read+0x130/0x130 ? trace_hardirqs_on_caller+0x182/0x280 ? trace_hardirqs_on_thunk+0x1a/0x1c entry_SYSCALL_64_fastpath+0x18/0xad RIP: 0033:0x7fe61c157c30 RSP: 002b:00007ffe87890258 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: ffffffff8114a410 RCX: 00007fe61c157c30 RDX: 0000000000000010 RSI: 000055814798f5e0 RDI: 0000000000000001 RBP: ffff8800c9027f98 R08: 00007fe61c422740 R09: 00007fe61ca53700 R10: 0000000000000073 R11: 0000000000000246 R12: 0000558147a36400 R13: 00007ffe8788f160 R14: 0000000000000024 R15: 00007ffe8788f15c ? trace_hardirqs_off_caller+0xc0/0x110 ---[ end trace 99fa09b3d9869c2c ]--- Bad trampoline accounting at: ffffffff81cc3b00 (do_IRQ+0x0/0x150) Cc: stable@vger.kernel.org Fixes: 59df055f1991 ("ftrace: trace different functions with a different tracer") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 34f63e78d661..4b6459a57fbc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3780,23 +3780,24 @@ static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) ftrace_probe_registered = 1; } -static void __disable_ftrace_function_probe(void) +static bool __disable_ftrace_function_probe(void) { int i; if (!ftrace_probe_registered) - return; + return false; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; if (hhd->first) - return; + return false; } /* no more funcs left */ ftrace_shutdown(&trace_probe_ops, 0); ftrace_probe_registered = 0; + return true; } @@ -3926,6 +3927,7 @@ static void __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { + struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; @@ -3937,6 +3939,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; int i, ret; + bool disabled; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) func_g.search = NULL; @@ -3955,6 +3958,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&trace_probe_ops.func_hash->regex_lock); + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) /* Hmm, should report this somehow */ @@ -3992,12 +3999,17 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } } mutex_lock(&ftrace_lock); - __disable_ftrace_function_probe(); + disabled = __disable_ftrace_function_probe(); /* * Remove after the disable is called. Otherwise, if the last * probe is removed, a null hash means *all enabled*. 
	 */
	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+
+	/* still need to update the function call sites */
+	if (ftrace_enabled && !disabled)
+		ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
+				       &old_hash_ops);
 	synchronize_sched();
 	if (!ret)
 		free_ftrace_hash_rcu(old_hash);

From fcdc71257923263d042236eaf62bae5e033757b5 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Mon, 17 Apr 2017 10:22:29 -0400
Subject: [PATCH 23/60] ftrace: Fix indexing of t_hash_start() from t_next()

t_hash_start() does not increment *pos, whereas t_next() must. But when t_next() does increment *pos, it must still pass in the original *pos to t_hash_start(), otherwise it will skip the first instance:

 # cd /sys/kernel/debug/tracing
 # echo schedule:traceoff > set_ftrace_filter
 # echo do_IRQ:traceoff > set_ftrace_filter
 # echo call_rcu > set_ftrace_filter
 # cat set_ftrace_filter
 call_rcu
 schedule:traceoff:unlimited
 do_IRQ:traceoff:unlimited

The above called t_hash_start() from t_start() as there was only one function (call_rcu), but if we add another function:

 # echo xfrm_policy_destroy_rcu >> set_ftrace_filter
 # cat set_ftrace_filter
 call_rcu
 xfrm_policy_destroy_rcu
 do_IRQ:traceoff:unlimited

The "schedule:traceoff" disappears.

Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/ftrace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4b6459a57fbc..b21a3e61ac74 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3205,6 +3205,7 @@ static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
+	loff_t l = *pos; /* t_hash_start() must use original pos */
 	void *ret;
 	if (unlikely(ftrace_disabled))
@@ -3216,13 +3217,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	if (iter->flags & FTRACE_ITER_PRINTALL) {
 		/* next must increment pos, and t_hash_start does not */
 		(*pos)++;
-		return t_hash_start(m, pos);
+		return t_hash_start(m, &l);
 	}
 	ret = t_func_next(m, pos);
 	if (!ret)
-		return t_hash_start(m, pos);
+		return t_hash_start(m, &l);
 	return ret;
 }

From b980b117c9ff17226937128a15692a18c9a28ed6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Tue, 11 Apr 2017 18:25:08 -0400
Subject: [PATCH 24/60] tracing: Have the trace_event benchmark thread call cond_resched_rcu_qs()

The trace_event benchmark thread runs in kernel space in an infinite loop while also calling cond_resched() in case anything else wants to schedule in. Unfortunately, on a PREEMPT kernel, that makes it a nop, in which case this will never voluntarily schedule. That will cause synchronize_rcu_tasks() to block forever on this thread while it is running. This is exactly what cond_resched_rcu_qs() is for. Use that instead.

Acked-by: "Paul E. McKenney"
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_benchmark.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e49fbe901cfc..16a8cf02eee9 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -153,10 +153,18 @@ static int benchmark_event_kthread(void *arg)
 		trace_do_benchmark();
 		/*
-		 * We don't go to sleep, but let others
-		 * run as well.
+		 * We don't go to sleep, but let others run as well.
+		 * This is basically a "yield()" to let any task that
+		 * wants to run, schedule in, but if the CPU is idle,
+		 * we'll keep burning cycles.
+		 *
+		 * Note the _rcu_qs() version of cond_resched() will
+		 * notify synchronize_rcu_tasks() that this thread has
+		 * passed a quiescent state for rcu_tasks. Otherwise
+		 * this thread will never voluntarily schedule, which would
+		 * block synchronize_rcu_tasks() indefinitely.
 		 */
-		cond_resched();
+		cond_resched_rcu_qs();
 	}
 	return 0;

From 1e10486ffee0a5b060c58b9c8c712422f7b88b3b Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Mon, 17 Apr 2017 11:44:28 +0900
Subject: [PATCH 25/60] ftrace: Add 'function-fork' trace option

The function-fork option is the same as event-fork in that it tracks task fork/exit and sets the pid filter properly. This can be useful if a user wants to trace only selected tasks and their children.

Link: http://lkml.kernel.org/r/20170417024430.21194-3-namhyung@kernel.org

Signed-off-by: Namhyung Kim
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/ftrace.c | 37 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c | 5 ++++-
 kernel/trace/trace.h | 6 +++++-
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b21a3e61ac74..b5ce7ea67e02 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5600,6 +5600,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
 		       trace_ignore_this_task(pid_list, next));
 }
+static void
+ftrace_pid_follow_sched_process_fork(void *data,
+				     struct task_struct *self,
+				     struct task_struct *task)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = data;
+
+	pid_list = rcu_dereference_sched(tr->function_pids);
+	trace_filter_add_remove_task(pid_list, self, task);
+}
+
+static void
+ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = data;
+
+	pid_list = rcu_dereference_sched(tr->function_pids);
+	trace_filter_add_remove_task(pid_list, NULL, task);
+}
+
+void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
+{
+	if (enable) {
+		register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
+						  tr);
+		register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+						  tr);
+	} else {
+		unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
+						    tr);
+		unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+						    tr);
+	}
+}
+
 static void clear_ftrace_pids(struct trace_array *tr)
 {
 	struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b5d4b80f2d45..8a5064a03ddf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -257,7 +257,7 @@ unsigned long long ns2usecs(u64 nsec)
 /* trace_flags that are default zero for instances */
 #define ZEROED_TRACE_FLAGS \
-	TRACE_ITER_EVENT_FORK
+	(TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
 /*
 * The global_trace is the descriptor that holds the top-level tracing
@@ -4205,6 +4205,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
 	if (mask == TRACE_ITER_EVENT_FORK)
 		trace_event_follow_fork(tr, enabled);
+	if (mask == TRACE_ITER_FUNC_FORK)
+		ftrace_pid_follow_fork(tr, enabled);
+
 	if (mask == TRACE_ITER_OVERWRITE) {
 		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 571acee52a32..31a4997b67c6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -897,6 +897,7 @@ void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
 void ftrace_init_tracefs_toplevel(struct trace_array
*tr, struct dentry *d_tracer);
 int init_function_trace(void);
+void ftrace_pid_follow_fork(struct trace_array *tr, bool enable);
 #else
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
@@ -916,6 +917,7 @@ static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
 static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
 static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
 static inline int init_function_trace(void) { return 0; }
+static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
@@ -989,11 +991,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 #ifdef CONFIG_FUNCTION_TRACER
 # define FUNCTION_FLAGS \
-		C(FUNCTION,		"function-trace"),
+		C(FUNCTION,		"function-trace"), \
+		C(FUNC_FORK,		"function-fork"),
 # define FUNCTION_DEFAULT_FLAGS		TRACE_ITER_FUNCTION
 #else
 # define FUNCTION_FLAGS
 # define FUNCTION_DEFAULT_FLAGS		0UL
+# define TRACE_ITER_FUNC_FORK		0UL
 #endif

 #ifdef CONFIG_STACKTRACE

From 560642d9ab5c37451ee43989dd9a35037ab5ab8d Mon Sep 17 00:00:00 2001
From: Namhyung Kim
Date: Mon, 17 Apr 2017 11:44:29 +0900
Subject: [PATCH 26/60] selftests: ftrace: Add -l/--logdir option

In my virtual machine setup, running ftracetest failed on creating LOG_DIR on a read-only filesystem. It'd be convenient to provide an option to specify a different directory as the log directory.

Link: http://lkml.kernel.org/r/20170417024430.21194-4-namhyung@kernel.org

Cc: Ingo Molnar
Acked-by: Masami Hiramatsu
Cc: Shuah Khan
Signed-off-by: Namhyung Kim
Signed-off-by: Steven Rostedt (VMware)
---
 tools/testing/selftests/ftrace/ftracetest | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index 52e3c4df28d6..a8631d978725 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -16,6 +16,7 @@ echo " -k|--keep Keep passed test logs"
 echo " -v|--verbose Increase verbosity of test messages"
 echo " -vv Alias of -v -v (Show all results in stdout)"
 echo " -d|--debug Debug mode (trace all shell commands)"
+echo " -l|--logdir <dir> Save logs on the <dir>"
 exit $1
 }
@@ -64,6 +65,10 @@ parse_opts() { # opts
 	DEBUG=1
 	shift 1
 	;;
+	--logdir|-l)
+	LOG_DIR=$2
+	shift 2
+	;;
 	*.tc)
 	if [ -f "$1" ]; then
 	OPT_TEST_CASES="$OPT_TEST_CASES `abspath $1`"

From 8e5e19c1b98394c2cdc186a4b0be422fe6e7217a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Mon, 17 Apr 2017 17:30:08 -0400
Subject: [PATCH 27/60] selftests: ftrace: Add a way to reset triggers in the set_ftrace_filter file

Just writing into the set_ftrace_filter file does not reset triggers, even though it can reset the function list. Triggers require writing the trigger name with a "!" prepended, and, worse, if the trigger has a count associated with it, removing it requires that number as well.

Add a reset_ftrace_filter function to the ftrace self tests to give the tests a generic way to clear triggers; it also resets any functions that are listed in that file.
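For example (an illustrative transcript in the style of the reproduction steps in earlier patches), an uncounted trigger only needs the "!" prefix, but a counted trigger has to be removed with its remaining count, as listed by the file, spelled out again:

 # echo 'do_IRQ:traceoff' > set_ftrace_filter
 # echo '!do_IRQ:traceoff' > set_ftrace_filter
 # echo 'schedule:traceoff:3' > set_ftrace_filter
 # echo '!schedule:traceoff:3' > set_ftrace_filter

The reset_ftrace_filter helper below automates exactly this, parsing each listed trigger and its limit.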
Signed-off-by: Steven Rostedt (VMware)
---
 .../testing/selftests/ftrace/test.d/functions | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index 91de1a8e4f19..9aec6fcb7729 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -30,6 +30,27 @@ reset_events_filter() { # reset all current setting filters
 	done
 }
+reset_ftrace_filter() { # reset all triggers in set_ftrace_filter
+	echo > set_ftrace_filter
+	grep -v '^#' set_ftrace_filter | while read t; do
+		tr=`echo $t | cut -d: -f2`
+		if [ "$tr" == "" ]; then
+			continue
+		fi
+		if [ $tr == "enable_event" -o $tr == "disable_event" ]; then
+			tr=`echo $t | cut -d: -f1-4`
+			limit=`echo $t | cut -d: -f5`
+		else
+			tr=`echo $t | cut -d: -f1-2`
+			limit=`echo $t | cut -d: -f3`
+		fi
+		if [ "$limit" != "unlimited" ]; then
+			tr="$tr:$limit"
+		fi
+		echo "!$tr" > set_ftrace_filter
+	done
+}
+
 disable_events() {
 	echo 0 > events/enable
 }

From 43bb45da82f94fffd5fb816e1fd2453561747c8b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Mon, 17 Apr 2017 17:33:50 -0400
Subject: [PATCH 28/60] selftests: ftrace: Add a selftest to test event enable/disable func trigger

This adds a test to enable and disable trace events via the function triggers. It tests enabling and disabling the sched:sched_switch event via the event_enable and event_disable function triggers attached to the schedule() kernel function.

The test does the following:

 o disable all events
 o disables or enables the sched_switch event
 o writes schedule:event_enable/disable:sched:sched_switch into set_ftrace_filter
 o 5 times it checks to make sure:
    . Writes 0/1 into the sched_switch/enable
    . Checks that the sched_switch/enable goes back to 1/0
 o Resets the events
 o writes schedule:event_enable/disable:sched:sched_switch:3 into set_ftrace_filter
 o Does a loop of 3 to see that sched_switch/enable file gets updated
 o Makes sure the sched_switch/enable stops getting updated

Signed-off-by: Steven Rostedt (VMware)
---
 .../test.d/ftrace/func_event_triggers.tc | 113 ++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc
new file mode 100644
index 000000000000..5c60afca24a6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc
@@ -0,0 +1,113 @@
+#!/bin/sh
+# description: ftrace - test for function event triggers
+#
+# Ftrace allows adding triggers to functions, such as enabling or disabling
+# tracing, enabling or disabling trace events, or recording a stack trace
+# within the ring buffer.
+#
+# This test is designed to test event triggers
+#
+
+# The triggers are set within the set_ftrace_filter file
+if [ ! -f set_ftrace_filter ]; then
+    echo "set_ftrace_filter not found? Is dynamic ftrace not set?"
+    exit_unsupported
+fi
+
+do_reset() {
+    reset_ftrace_filter
+    reset_tracer
+    disable_events
+    clear_trace
+    enable_tracing
+}
+
+fail() { # mesg
+    do_reset
+    echo $1
+    exit $FAIL
+}
+
+SLEEP_TIME=".1"
+
+do_reset
+
+echo "Testing function probes with events:"
+
+EVENT="sched:sched_switch"
+EVENT_ENABLE="events/sched/sched_switch/enable"
+
+cnt_trace() {
+    grep -v '^#' trace | wc -l
+}
+
+test_event_enabled() {
+    val=$1
+
+    e=`cat $EVENT_ENABLE`
+    if [ "$e" != $val ]; then
+	echo "Expected $val but found $e"
+	exit -1
+    fi
+}
+
+run_enable_disable() {
+    enable=$1		# enable
+    Enable=$2		# Enable
+    check_disable=$3	# 0
+    check_enable_star=$4	# 1*
+    check_disable_star=$5	# 0*
+
+    cnt=`cnt_trace`
+    if [ $cnt -ne 0 ]; then
+	fail "Found junk in trace file"
+    fi
+
+    echo "$Enable event all the time"
+
+    echo $check_disable > $EVENT_ENABLE
+    sleep $SLEEP_TIME
+
+    test_event_enabled $check_disable
+
+    echo "schedule:${enable}_event:$EVENT" > set_ftrace_filter
+
+    echo " make sure it works 5 times"
+
+    for i in `seq 5`; do
+	sleep $SLEEP_TIME
+	echo "  test $i"
+	test_event_enabled $check_enable_star
+
+	echo $check_disable > $EVENT_ENABLE
+    done
+    sleep $SLEEP_TIME
+    echo " make sure it still works"
+    test_event_enabled $check_enable_star
+
+    reset_ftrace_filter
+
+    echo " make sure it only works 3 times"
+
+    echo $check_disable > $EVENT_ENABLE
+    sleep $SLEEP_TIME
+
+    echo "schedule:${enable}_event:$EVENT:3" > set_ftrace_filter
+
+    for i in `seq 3`; do
+	sleep $SLEEP_TIME
+	echo "  test $i"
+	test_event_enabled $check_enable_star
+
+	echo $check_disable > $EVENT_ENABLE
+    done
+
+    sleep $SLEEP_TIME
+    echo " make sure it stops working"
+    test_event_enabled $check_disable_star
+
+    do_reset
+}
+
+run_enable_disable enable Enable 0 "1*" "0*"
+run_enable_disable disable Disable 1 "0*" "1*"

From d8b39e1d983efeb6c5835bcdd477e9c7cf1110fd Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Mon, 17 Apr 2017 18:05:53 -0400
Subject: [PATCH 29/60] selftests: ftrace: Add a test to test function triggers to start and stop tracing

This adds a test for the function triggers traceon and traceoff, to make sure that they start and stop tracing when a function is hit.

The test performs the following:

 o Enables all events
 o Writes schedule:traceoff into set_ftrace_filter
 o Makes sure the trigger exists in the file
 o Makes sure the trace file no longer grows
 o Makes sure that tracing_on is now zero
 o Clears the trace file
 o Makes sure it's still empty
 o Removes the trigger
 o Makes sure tracing is still off (tracing_on is zero)
 o Writes schedule:traceon into set_ftrace_filter
 o Makes sure the trace file is no longer empty
 o Makes sure that tracing_on file is set to one
 o Removes the trigger
 o Makes sure the trigger is no longer there
 o Writes schedule:traceoff:3 into set_ftrace_filter
 o Makes sure that tracing_on turns off
    . Writes 1 into tracing_on
    . Makes sure that it turns off 2 more times
 o Writes 1 into tracing_on
 o Makes sure that tracing_on is still a one

Signed-off-by: Steven Rostedt (VMware)
---
 .../test.d/ftrace/func_traceonoff_triggers.tc | 171 ++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
new file mode 100644
index 000000000000..3c60ca61fee3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
@@ -0,0 +1,171 @@
+#!/bin/sh
+# description: ftrace - test for function traceon/off triggers
+#
+# Ftrace allows adding triggers to functions, such as enabling or disabling
+# tracing, enabling or disabling trace events, or recording a stack trace
+# within the ring buffer.
+#
+# This test is designed to test enabling and disabling tracing triggers
+#
+
+# The triggers are set within the set_ftrace_filter file
+if [ ! -f set_ftrace_filter ]; then
+    echo "set_ftrace_filter not found? Is dynamic ftrace not set?"
+    exit_unsupported
+fi
+
+do_reset() {
+    reset_ftrace_filter
+    reset_tracer
+    disable_events
+    clear_trace
+    enable_tracing
+}
+
+fail() { # mesg
+    do_reset
+    echo $1
+    exit $FAIL
+}
+
+SLEEP_TIME=".1"
+
+do_reset
+
+echo "Testing function probes with enabling/disabling tracing:"
+
+cnt_trace() {
+    grep -v '^#' trace | wc -l
+}
+
+echo '** DISABLE TRACING'
+disable_tracing
+clear_trace
+
+cnt=`cnt_trace`
+if [ $cnt -ne 0 ]; then
+    fail "Found junk in trace"
+fi
+
+
+echo '** ENABLE EVENTS'
+
+echo 1 > events/enable
+
+echo '** ENABLE TRACING'
+enable_tracing
+
+cnt=`cnt_trace`
+if [ $cnt -eq 0 ]; then
+    fail "Nothing found in trace"
+fi
+
+# powerpc uses .schedule
+func="schedule"
+x=`grep '^\.schedule$' available_filter_functions | wc -l`
+if [ "$x" -eq 1 ]; then
+    func=".schedule"
+fi
+
+echo '** SET TRACEOFF'
+
+echo "$func:traceoff" > set_ftrace_filter
+
+cnt=`grep schedule set_ftrace_filter | wc -l`
+if [ $cnt -ne 1 ]; then
+    fail "Did not find traceoff trigger"
+fi
+
+cnt=`cnt_trace`
+sleep $SLEEP_TIME
+cnt2=`cnt_trace`
+
+if [ $cnt -ne $cnt2 ]; then
+    fail "Tracing is not stopped"
+fi
+
+on=`cat tracing_on`
+if [ $on != "0" ]; then
+    fail "Tracing is not off"
+fi
+
+line1=`cat trace | tail -1`
+sleep $SLEEP_TIME
+line2=`cat trace | tail -1`
+
+if [ "$line1" != "$line2" ]; then
+    fail "Tracing file is still changing"
+fi
+
+clear_trace
+
+cnt=`cnt_trace`
+if [ $cnt -ne 0 ]; then
+    fail "Tracing is still happening"
+fi
+
+echo "!$func:traceoff" >> set_ftrace_filter
+
+cnt=`grep schedule set_ftrace_filter | wc -l`
+if [ $cnt -ne 0 ]; then
+    fail "traceoff trigger still exists"
+fi
+
+on=`cat tracing_on`
+if [ $on != "0" ]; then
+    fail "Tracing is started again"
+fi
+
+echo "$func:traceon" > set_ftrace_filter
+
+cnt=`grep schedule set_ftrace_filter | wc -l`
+if [ $cnt -ne 1 ]; then
+    fail "traceon trigger not found"
+fi
+
+cnt=`cnt_trace`
+if [ $cnt -eq 0 ]; then
+    fail "Tracing did not start"
+fi
+
+on=`cat tracing_on`
+if [ $on != "1" ]; then
+    fail "Tracing was not enabled"
+fi
+
+
+echo "!$func:traceon" >> set_ftrace_filter
+
+cnt=`grep schedule set_ftrace_filter | wc -l`
+if [ $cnt -ne 0 ]; then
+    fail "traceon trigger still exists"
+fi
+
+check_sleep() {
+    val=$1
+    sleep $SLEEP_TIME
+    cat set_ftrace_filter
+    on=`cat tracing_on`
+    if [ $on != "$val" ]; then
+	fail "Expected tracing_on to be $val, but it was $on"
+    fi
+}
+
+
+echo "$func:traceoff:3" > set_ftrace_filter +check_sleep "0" +echo 1 > tracing_on +check_sleep "0" +echo 1 > tracing_on +check_sleep "0" +echo 1 > tracing_on +check_sleep "1" +echo "!$func:traceoff:0" > set_ftrace_filter + +if grep -e traceon -e traceoff set_ftrace_filter; then + fail "Tracing on and off triggers still exist" +fi + +disable_events + +exit 0 From a9064f676ed6de7bf3a07e26fc19494d48fa4619 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 18 Apr 2017 11:22:38 -0400 Subject: [PATCH 30/60] selftests: ftrace: Add test to test reading of set_ftrace_file The set_ftrace_file lists both functions that are filtered, as well as function probes (triggers) that are attached to a function, like traceon or stacktrace, etc. The reading of this file is not as trivial as most pseudo files are, and there's been various bugs that have appeared in the past when there's a mix of probes and functions listed. There's also a difference when reading the file using dd with a block size of 1. This test performs the following: o Resets set_ftrace_filter o Makes sure only "#### all functions enabled ####" is listed (All checks uses cat, and dd with bs=1 and bs=100) o Adds a traceon trigger to schedule o Checks if only "#### all function enabled ####" and the trigger is there. o Adds tracing of schedule o Checks if only schedule and the trigger is there o Adds tracing of do_IRQ as well o Checks if only schedule, do_IRQ and the trigger is there o Adds a traceon trigger to do_IRQ o Checks if only schedule, do_IRQ and both triggers are there o Removes tracing of do_IRQ o Checks if only schedule and both triggers are there o Removes tracing of schedule o Checks if only "#### all functions enabled ####" and both triggers are there o Removes the triggers o Checks if only "#### all functions enabled ####" is there o Adds tracing of schedule o Checks if only schedule is there o Adds tracing of do_IRQ o Checks if only schedule and do_IRQ are there Signed-off-by: Steven Rostedt (VMware) --- .../test.d/ftrace/func_set_ftrace_file.tc | 132 ++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc new file mode 100644 index 000000000000..113b4d9bc733 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -0,0 +1,132 @@ +#!/bin/sh +# description: ftrace - test reading of set_ftrace_filter +# +# The set_ftrace_filter file of ftrace is used to list functions as well as +# triggers (probes) attached to functions. The code to read this file is not +# straight forward and has had various bugs in the past. This test is designed +# to add functions and triggers to that file in various ways and read that +# file in various ways (cat vs dd). +# + +# The triggers are set within the set_ftrace_filter file +if [ ! -f set_ftrace_filter ]; then + echo "set_ftrace_filter not found? Is dynamic ftrace not set?" + exit_unsupported +fi + +do_reset() { + reset_tracer + reset_ftrace_filter + disable_events + clear_trace + enable_tracing +} + +fail() { # mesg + do_reset + echo $1 + exit $FAIL +} + +do_reset + +FILTER=set_ftrace_filter +FUNC1="schedule" +FUNC2="do_IRQ" + +ALL_FUNCS="#### all functions enabled ####" + +test_func() { + if ! 
echo "$1" | grep -q "^$2\$"; then + return 0 + fi + echo "$1" | grep -v "^$2\$" + return 1 +} + +check_set_ftrace_filter() { + cat=`cat $FILTER` + dd1=`dd if=$FILTER bs=1 | grep -v -e 'records in' -e 'records out' -e 'bytes copied'` + dd100=`dd if=$FILTER bs=100 | grep -v -e 'records in' -e 'records out' -e 'bytes copied'` + + echo "Testing '$@'" + + while [ $# -gt 0 ]; do + echo "test $1" + if cat=`test_func "$cat" "$1"`; then + return 0 + fi + if dd1=`test_func "$dd1" "$1"`; then + return 0 + fi + if dd100=`test_func "$dd100" "$1"`; then + return 0 + fi + shift + done + + if [ -n "$cat" ]; then + return 0 + fi + if [ -n "$dd1" ]; then + return 0 + fi + if [ -n "$dd100" ]; then + return 0 + fi + return 1; +} + +if check_set_ftrace_filter "$ALL_FUNCS"; then + fail "Expected only $ALL_FUNCS" +fi + +echo "$FUNC1:traceoff" > set_ftrace_filter +if check_set_ftrace_filter "$ALL_FUNCS" "$FUNC1:traceoff:unlimited"; then + fail "Expected $ALL_FUNCS and $FUNC1:traceoff:unlimited" +fi + +echo "$FUNC1" > set_ftrace_filter +if check_set_ftrace_filter "$FUNC1" "$FUNC1:traceoff:unlimited"; then + fail "Expected $FUNC1 and $FUNC1:traceoff:unlimited" +fi + +echo "$FUNC2" >> set_ftrace_filter +if check_set_ftrace_filter "$FUNC1" "$FUNC2" "$FUNC1:traceoff:unlimited"; then + fail "Expected $FUNC1 $FUNC2 and $FUNC1:traceoff:unlimited" +fi + +echo "$FUNC2:traceoff" >> set_ftrace_filter +if check_set_ftrace_filter "$FUNC1" "$FUNC2" "$FUNC1:traceoff:unlimited" "$FUNC2:traceoff:unlimited"; then + fail "Expected $FUNC1 $FUNC2 $FUNC1:traceoff:unlimited and $FUNC2:traceoff:unlimited" +fi + +echo "$FUNC1" > set_ftrace_filter +if check_set_ftrace_filter "$FUNC1" "$FUNC1:traceoff:unlimited" "$FUNC2:traceoff:unlimited"; then + fail "Expected $FUNC1 $FUNC1:traceoff:unlimited and $FUNC2:traceoff:unlimited" +fi + +echo > set_ftrace_filter +if check_set_ftrace_filter "$ALL_FUNCS" "$FUNC1:traceoff:unlimited" "$FUNC2:traceoff:unlimited"; then + fail "Expected $ALL_FUNCS $FUNC1:traceoff:unlimited and $FUNC2:traceoff:unlimited" +fi + +reset_ftrace_filter + +if check_set_ftrace_filter "$ALL_FUNCS"; then + fail "Expected $ALL_FUNCS" +fi + +echo "$FUNC1" > set_ftrace_filter +if check_set_ftrace_filter "$FUNC1" ; then + fail "Expected $FUNC1" +fi + +echo "$FUNC2" >> set_ftrace_filter +if check_set_ftrace_filter "$FUNC1" "$FUNC2" ; then + fail "Expected $FUNC1 and $FUNC2" +fi + +do_reset + +exit 0 From ec19b85913486993d7d6f747beed1a711afd47d8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 31 Mar 2017 19:01:14 -0400 Subject: [PATCH 31/60] ftrace: Move the probe function into the tracing directory As nothing outside the tracing directory uses the function probes mechanism, I'm moving the prototypes out of the include/linux/ftrace.h and into the local kernel/trace/trace.h header. I plan on making them hook to the trace_array structure which is local to kernel/trace, and I do not want to expose it to the rest of the kernel. This requires that the probe functions must also be local to tracing. But luckily nothing else uses them. 
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 24 ------------------------ kernel/trace/trace.h | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 06b2990a35e4..3e790ff1c501 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -360,30 +360,6 @@ void ftrace_bug(int err, struct dyn_ftrace *rec); struct seq_file; -struct ftrace_probe_ops { - void (*func)(unsigned long ip, - unsigned long parent_ip, - void **data); - int (*init)(struct ftrace_probe_ops *ops, - unsigned long ip, void **data); - void (*free)(struct ftrace_probe_ops *ops, - unsigned long ip, void **data); - int (*print)(struct seq_file *m, - unsigned long ip, - struct ftrace_probe_ops *ops, - void *data); -}; - -extern int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); -extern void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); -extern void -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); -extern void unregister_ftrace_function_probe_all(char *glob); - extern int ftrace_text_reserved(const void *start, const void *end); extern int ftrace_nr_registered_ops(void); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 31a4997b67c6..2ff6d49fa5ca 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -923,6 +923,31 @@ static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) + +struct ftrace_probe_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + void **data); + int (*init)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); + void (*free)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_probe_ops *ops, + void *data); +}; + +extern int +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data); +extern void +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data); +extern void +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); +extern void unregister_ftrace_function_probe_all(char *glob); + void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); From 92a68fa047ca5b8e1991af2d50b23ad9452613cd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 31 Mar 2017 19:21:41 -0400 Subject: [PATCH 32/60] ftrace: Move the function commands into the tracing directory As nothing outside the tracing directory uses the function command mechanism, I'm moving the prototypes out of the include/linux/ftrace.h and into the local kernel/trace/trace.h header. I plan on making them hook to the trace_array structure which is local to kernel/trace, and I do not want to expose it to the rest of the kernel. This requires that the command functions must also be local to tracing. But luckily nothing else uses them. 
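For reference, a command registration under these prototypes looks roughly like this (a sketch with made-up names, mirroring how the traceon/traceoff commands are wired up):

	static int my_cmd_func(struct ftrace_hash *hash, char *func,
			       char *cmd, char *params, int enable)
	{
		/* act on the matched function @func, parsing optional @params */
		return 0;
	}

	static struct ftrace_func_command my_cmd = {
		.name = "mycmd",
		.func = my_cmd_func,
	};

	/* makes "<function>:mycmd[:params]" accepted by set_ftrace_filter */
	register_ftrace_command(&my_cmd);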
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 19 ------------------- kernel/trace/trace.h | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3e790ff1c501..774e7a95c201 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -326,14 +326,6 @@ static inline void stack_tracer_disable(void) { } static inline void stack_tracer_enable(void) { } #endif -struct ftrace_func_command { - struct list_head list; - char *name; - int (*func)(struct ftrace_hash *hash, - char *func, char *cmd, - char *params, int enable); -}; - #ifdef CONFIG_DYNAMIC_FTRACE int ftrace_arch_code_modify_prepare(void); @@ -421,9 +413,6 @@ void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); void ftrace_free_filter(struct ftrace_ops *ops); void ftrace_ops_set_global_filter(struct ftrace_ops *ops); -int register_ftrace_command(struct ftrace_func_command *cmd); -int unregister_ftrace_command(struct ftrace_func_command *cmd); - enum { FTRACE_UPDATE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -639,14 +628,6 @@ static inline void ftrace_enable_daemon(void) { } static inline void ftrace_module_init(struct module *mod) { } static inline void ftrace_module_enable(struct module *mod) { } static inline void ftrace_release_mod(struct module *mod) { } -static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) -{ - return -EINVAL; -} -static inline __init int unregister_ftrace_command(char *cmd_name) -{ - return -EINVAL; -} static inline int ftrace_text_reserved(const void *start, const void *end) { return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2ff6d49fa5ca..a63411c53c5e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -880,6 +880,13 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(struct ftrace_hash *hash, + char *func, char *cmd, + char *params, int enable); +}; extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { @@ -948,10 +955,23 @@ extern void unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); extern void unregister_ftrace_function_probe_all(char *glob); +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); #else +struct ftrace_func_command; + +static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) +{ + return -EINVAL; +} +static inline __init int unregister_ftrace_command(char *cmd_name) +{ + return -EINVAL; +} /* * The ops parameter passed in is usually undefined. * This must be a macro. From e51a9896794bbb819d89b803e5a8446199034853 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Apr 2017 17:43:49 -0400 Subject: [PATCH 33/60] ftrace: Remove unused "flags" field from struct ftrace_func_probe Nothing uses "flags" in the ftrace_func_probe descriptor. Remove it. 
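With the field gone, the per-function probe descriptor reduces to the following (readable directly off the hunk below):

	struct ftrace_func_probe {
		struct hlist_node	node;
		struct ftrace_probe_ops	*ops;
		unsigned long		ip;
		void			*data;
		struct list_head	free_list;
	};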
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b5ce7ea67e02..b6dc29583c86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1101,7 +1101,6 @@ static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; struct ftrace_func_probe { struct hlist_node node; struct ftrace_probe_ops *ops; - unsigned long flags; unsigned long ip; void *data; struct list_head free_list; From bca6c8d0480a8aa5c86f8f416db96c71f6b79e29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Apr 2017 18:18:47 -0400 Subject: [PATCH 34/60] ftrace: Pass probe ops to probe function In preparation to cleaning up the probe function registration code, the "data" parameter will eventually be removed from the probe->func() call. Instead it will receive its own "ops" function, in which it can set up its own data that it needs to map. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 6 ++++-- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 8 +++++--- kernel/trace/trace_functions.c | 24 ++++++++++++++++-------- 5 files changed, 27 insertions(+), 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b6dc29583c86..d8873079bed4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3739,7 +3739,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); hlist_for_each_entry_rcu_notrace(entry, hhd, node) { if (entry->ip == ip) - entry->ops->func(ip, parent_ip, &entry->data); + entry->ops->func(ip, parent_ip, entry->ops, &entry->data); } preempt_enable_notrace(); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a5064a03ddf..41e9a20f91f0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6735,13 +6735,15 @@ static const struct file_operations tracing_dyn_info_fops = { #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) static void -ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data) { tracing_snapshot(); } static void -ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data) { unsigned long *count = (long *)data; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a63411c53c5e..0f915c264c19 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -934,6 +934,7 @@ static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { struct ftrace_probe_ops { void (*func)(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data); int (*init)(struct ftrace_probe_ops *ops, unsigned long ip, void **data); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 93116549a284..9dbac1881b03 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2461,7 +2461,8 @@ struct event_probe_data { }; static void -event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +event_enable_probe(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **_data) { struct event_probe_data **pdata = (struct event_probe_data **)_data; struct event_probe_data *data = *pdata; @@ -2476,7 +2477,8 @@ 
event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) } static void -event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **_data) { struct event_probe_data **pdata = (struct event_probe_data **)_data; struct event_probe_data *data = *pdata; @@ -2494,7 +2496,7 @@ event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data if (data->count != -1) (data->count)--; - event_enable_probe(ip, parent_ip, _data); + event_enable_probe(ip, parent_ip, ops, _data); } static int diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 4199ca61b0e5..b99f6231281e 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -326,19 +326,22 @@ static void update_traceon_count(void **data, bool on) } static void -ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data) { update_traceon_count(data, 1); } static void -ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data) { update_traceon_count(data, 0); } static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data) { if (tracing_is_on()) return; @@ -347,7 +350,8 @@ ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) } static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data) { if (!tracing_is_on()) return; @@ -365,13 +369,15 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) #define STACK_SKIP 4 static void -ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data) { trace_dump_stack(STACK_SKIP); } static void -ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data) { long *count = (long *)data; long old_count; @@ -419,7 +425,8 @@ static int update_count(void **data) } static void -ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **data) { if (update_count(data)) ftrace_dump(DUMP_ALL); @@ -427,7 +434,8 @@ ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) /* Only dump the current CPU buffer. 
 */
 static void
-ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip,
+		     struct ftrace_probe_ops *ops, void **data)
 {
 	if (update_count(data))
 		ftrace_dump(DUMP_ORIG);

From 41794f190780c28784fa62b22001691e5876d149 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Mon, 3 Apr 2017 20:58:35 -0400
Subject: [PATCH 35/60] ftrace: Added ftrace_func_mapper for function probe triggers

In order to move the ops to the function probes directly, they need a way to map function ips to their own data without depending on the infrastructure of the function probes, as the data field will be going away.

New helper functions are added that are based on the ftrace_hash code. ftrace_func_mapper functions are there to let the probes map ips to their data. These can be allocated by the probe ops, and referenced in the function callbacks.

Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/ftrace.c | 141 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h | 14 ++++
 kernel/trace/trace_events.c | 74 ++++++++++++++-----
 3 files changed, 212 insertions(+), 17 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d8873079bed4..ac47d1845fdb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3808,6 +3808,147 @@ static void ftrace_free_entry(struct ftrace_func_probe *entry)
 	kfree(entry);
 }
+struct ftrace_func_map {
+	struct ftrace_func_entry	entry;
+	void				*data;
+};
+
+struct ftrace_func_mapper {
+	struct ftrace_hash		hash;
+};
+
+/**
+ * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper
+ *
+ * Returns an ftrace_func_mapper descriptor that can be used to map ips to data.
+ */
+struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
+{
+	struct ftrace_hash *hash;
+
+	/*
+	 * The mapper is simply a ftrace_hash, but since the entries
+	 * in the hash are not ftrace_func_entry type, we define it
+	 * as a separate structure.
+	 */
+	hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+	return (struct ftrace_func_mapper *)hash;
+}
+
+/**
+ * ftrace_func_mapper_find_ip - Find some data mapped to an ip
+ * @mapper: The mapper that has the ip maps
+ * @ip: the instruction pointer to find the data for
+ *
+ * Returns the data mapped to @ip if found, otherwise NULL. The return
+ * is actually the address of the mapper data pointer. The address is
+ * returned for use cases where the data is no bigger than a long, and
+ * the user can use the data pointer as its data instead of having to
+ * allocate more memory for the reference.
+ */
+void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
+				  unsigned long ip)
+{
+	struct ftrace_func_entry *entry;
+	struct ftrace_func_map *map;
+
+	entry = ftrace_lookup_ip(&mapper->hash, ip);
+	if (!entry)
+		return NULL;
+
+	map = (struct ftrace_func_map *)entry;
+	return &map->data;
+}
+
+/**
+ * ftrace_func_mapper_add_ip - Map some data to an ip
+ * @mapper: The mapper that has the ip maps
+ * @ip: The instruction pointer address to map @data to
+ * @data: The data to map to @ip
+ *
+ * Returns 0 on success, otherwise an error.
+ */ +int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, + unsigned long ip, void *data) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (entry) + return -EBUSY; + + map = kmalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + map->entry.ip = ip; + map->data = data; + + __add_hash_entry(&mapper->hash, &map->entry); + + return 0; +} + +/** + * ftrace_func_mapper_remove_ip - Remove an ip from the mapping + * @mapper: The mapper that has the ip maps + * @ip: The instruction pointer address to remove the data from + * + * Returns the data if it is found, otherwise NULL. + * Note, if the data pointer is used as the data itself (see + * ftrace_func_mapper_find_ip()), then the return value may be meaningless + * if the data pointer was set to zero. + */ +void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, + unsigned long ip) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + void *data; + + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (!entry) + return NULL; + + map = (struct ftrace_func_map *)entry; + data = map->data; + + remove_hash_entry(&mapper->hash, entry); + kfree(entry); + + return data; +} + +/** + * free_ftrace_func_mapper - free a mapping of ips and data + * @mapper: The mapper that has the ip maps + * @free_func: A function to be called on each data item. + * + * This is used to free the function mapper. The @free_func is optional + * and can be used if the data needs to be freed as well. + */ +void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, + ftrace_mapper_func free_func) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + struct hlist_head *hhd; + int size = 1 << mapper->hash.size_bits; + int i; + + if (free_func && mapper->hash.count) { + for (i = 0; i < size; i++) { + hhd = &mapper->hash.buckets[i]; + hlist_for_each_entry(entry, hhd, hlist) { + map = (struct ftrace_func_map *)entry; + free_func(map); + } + } + } + free_ftrace_hash(&mapper->hash); +} + int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0f915c264c19..dbbdee21bcc4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -944,8 +944,22 @@ struct ftrace_probe_ops { unsigned long ip, struct ftrace_probe_ops *ops, void *data); + void *private_data; }; +struct ftrace_func_mapper; +typedef int (*ftrace_mapper_func)(void *data); + +struct ftrace_func_mapper *allocate_ftrace_func_mapper(void); +void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, + unsigned long ip); +int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, + unsigned long ip, void *data); +void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, + unsigned long ip); +void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, + ftrace_mapper_func free_func); + extern int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9dbac1881b03..ee308312fe87 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2460,32 +2460,44 @@ struct event_probe_data { bool enable; }; -static void -event_enable_probe(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **_data) +static void update_event_probe(struct event_probe_data *data) { - struct event_probe_data **pdata = (struct 
event_probe_data **)_data; - struct event_probe_data *data = *pdata; - - if (!data) - return; - if (data->enable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); else set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); } +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, + struct ftrace_probe_ops *ops, void **_data) +{ + struct ftrace_func_mapper *mapper = ops->private_data; + struct event_probe_data *data; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + if (!pdata || !*pdata) + return; + + data = *pdata; + update_event_probe(data); +} + static void event_enable_count_probe(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, void **_data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = ops->private_data; + struct event_probe_data *data; + void **pdata; - if (!data) + pdata = ftrace_func_mapper_find_ip(mapper, ip); + if (!pdata || !*pdata) return; + data = *pdata; + if (!data->count) return; @@ -2496,14 +2508,23 @@ event_enable_count_probe(unsigned long ip, unsigned long parent_ip, if (data->count != -1) (data->count)--; - event_enable_probe(ip, parent_ip, ops, _data); + update_event_probe(data); } static int event_enable_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *_data) { - struct event_probe_data *data = _data; + struct ftrace_func_mapper *mapper = ops->private_data; + struct event_probe_data *data; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + + if (WARN_ON_ONCE(!pdata || !*pdata)) + return 0; + + data = *pdata; seq_printf(m, "%ps:", (void *)ip); @@ -2524,10 +2545,17 @@ static int event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, void **_data) { + struct ftrace_func_mapper *mapper = ops->private_data; struct event_probe_data **pdata = (struct event_probe_data **)_data; struct event_probe_data *data = *pdata; + int ret; + + ret = ftrace_func_mapper_add_ip(mapper, ip, data); + if (ret < 0) + return ret; data->ref++; + return 0; } @@ -2535,8 +2563,13 @@ static void event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, void **_data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = ops->private_data; + struct event_probe_data *data; + + data = ftrace_func_mapper_remove_ip(mapper, ip); + + if (WARN_ON_ONCE(!data)) + return; if (WARN_ON_ONCE(data->ref <= 0)) return; @@ -2548,7 +2581,6 @@ event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, module_put(data->file->event_call->mod); kfree(data); } - *pdata = NULL; } static struct ftrace_probe_ops event_enable_probe_ops = { @@ -2627,6 +2659,13 @@ event_enable_func(struct ftrace_hash *hash, } ret = -ENOMEM; + + if (!ops->private_data) { + ops->private_data = allocate_ftrace_func_mapper(); + if (!ops->private_data) + goto out; + } + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; @@ -2663,6 +2702,7 @@ event_enable_func(struct ftrace_hash *hash, ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; + ret = register_ftrace_function_probe(glob, ops, data); /* * The above returns on success the # of functions enabled, From 1a93f8bd19bd24f9b41136e70188d8f4de90c6a2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Apr 2017 22:09:43 -0400 Subject: [PATCH 36/60] tracing: Have the snapshot trigger use the 
mapping helper functions As the data pointer for individual ips will soon be removed and no longer passed to the callback function probe handlers, convert the snapshot trigger counter over to the new ftrace_func_mapper helper functions. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 52 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 41e9a20f91f0..7febeb823c62 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6745,13 +6745,19 @@ static void ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, void **data) { - unsigned long *count = (long *)data; + struct ftrace_func_mapper *mapper = ops->private_data; + long *count = NULL; - if (!*count) - return; + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + + if (*count <= 0) + return; - if (*count != -1) (*count)--; + } tracing_snapshot(); } @@ -6760,20 +6766,42 @@ static int ftrace_snapshot_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - long count = (long)data; + struct ftrace_func_mapper *mapper = ops->private_data; + long *count = NULL; seq_printf(m, "%ps:", (void *)ip); seq_puts(m, "snapshot"); - if (count == -1) - seq_puts(m, ":unlimited\n"); + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); else - seq_printf(m, ":count=%ld\n", count); + seq_puts(m, ":unlimited\n"); return 0; } +static int +ftrace_snapshot_init(struct ftrace_probe_ops *ops, unsigned long ip, + void **data) +{ + struct ftrace_func_mapper *mapper = ops->private_data; + + return ftrace_func_mapper_add_ip(mapper, ip, *data); +} + +static void +ftrace_snapshot_free(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct ftrace_func_mapper *mapper = ops->private_data; + + ftrace_func_mapper_remove_ip(mapper, ip); +} + static struct ftrace_probe_ops snapshot_probe_ops = { .func = ftrace_snapshot, .print = ftrace_snapshot_print, @@ -6782,6 +6810,8 @@ static struct ftrace_probe_ops snapshot_probe_ops = { static struct ftrace_probe_ops snapshot_count_probe_ops = { .func = ftrace_count_snapshot, .print = ftrace_snapshot_print, + .init = ftrace_snapshot_init, + .free = ftrace_snapshot_free, }; static int @@ -6812,6 +6842,12 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, if (!strlen(number)) goto out_reg; + if (!ops->private_data) { + ops->private_data = allocate_ftrace_func_mapper(); + if (!ops->private_data) + return -ENOMEM; + } + /* * We use the callback data field (which is a pointer) * as our counter. From fe014e24b6adfecdf8b3d0de1ef4d55a3eaf2b50 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Apr 2017 23:22:41 -0400 Subject: [PATCH 37/60] ftrace: Convert the rest of the function trigger over to the mapping functions As the data pointer for individual ips will soon be removed and no longer passed to the callback function probe handlers, convert the rest of the function trigger counters over to the new ftrace_func_mapper helper functions. 
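To make the shared pattern concrete, here is a minimal user-space sketch of how a counted trigger behaves once converted: the callback asks the mapper for the slot keyed by ip and treats the slot's data pointer itself as a long counter, the trick that ftrace_func_mapper_find_ip() enables by returning the address of the data pointer. The toy fixed-size table below stands in for ftrace_hash; all names are illustrative, not kernel API.

#include <stdio.h>

#define MAPPER_SIZE 16

struct func_map {
	unsigned long ip;	/* 0 means "slot free" in this toy model */
	void *data;
};

struct func_mapper {
	struct func_map slots[MAPPER_SIZE];
};

/* Return the address of the data pointer mapped to @ip, or NULL */
static void **mapper_find_ip(struct func_mapper *mapper, unsigned long ip)
{
	int i;

	for (i = 0; i < MAPPER_SIZE; i++) {
		if (mapper->slots[i].ip == ip)
			return &mapper->slots[i].data;
	}
	return NULL;
}

static int mapper_add_ip(struct func_mapper *mapper, unsigned long ip,
			 void *data)
{
	int i;

	if (mapper_find_ip(mapper, ip))
		return -1;	/* already mapped; the kernel returns -EBUSY */
	for (i = 0; i < MAPPER_SIZE; i++) {
		if (!mapper->slots[i].ip) {
			mapper->slots[i].ip = ip;
			mapper->slots[i].data = data;
			return 0;
		}
	}
	return -1;		/* table full; the kernel returns -ENOMEM */
}

/* A "count" style trigger: fire only while the per-ip budget is positive */
static void count_probe(struct func_mapper *mapper, unsigned long ip)
{
	long *count = (long *)mapper_find_ip(mapper, ip);

	if (!count || *count <= 0)
		return;
	(*count)--;
	printf("fired at %#lx, %ld left\n", ip, *count);
}

int main(void)
{
	struct func_mapper mapper = { { { 0 } } };
	int i;

	/* store the counter directly in the data pointer slot */
	mapper_add_ip(&mapper, 0x1000, (void *)3L);
	for (i = 0; i < 5; i++)
		count_probe(&mapper, 0x1000);
	return 0;
}

Running the sketch fires the probe three times and then goes quiet, which is exactly the behavior the snapshot and trace on/off counters preserve across this conversion.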
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions.c | 123 +++++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 38 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b99f6231281e..d9cbde8575a8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -267,10 +267,12 @@ static struct tracer function_trace __tracer_data = }; #ifdef CONFIG_DYNAMIC_FTRACE -static void update_traceon_count(void **data, bool on) +static void update_traceon_count(struct ftrace_probe_ops *ops, + unsigned long ip, bool on) { - long *count = (long *)data; - long old_count = *count; + struct ftrace_func_mapper *mapper = ops->private_data; + long *count; + long old_count; /* * Tracing gets disabled (or enabled) once per count. @@ -301,7 +303,10 @@ static void update_traceon_count(void **data, bool on) * setting the tracing_on file. But we currently don't care * about that. */ - if (!old_count) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + old_count = *count; + + if (old_count <= 0) return; /* Make sure we see count before checking tracing state */ @@ -315,10 +320,6 @@ static void update_traceon_count(void **data, bool on) else tracing_off(); - /* unlimited? */ - if (old_count == -1) - return; - /* Make sure tracing state is visible before updating count */ smp_wmb(); @@ -329,14 +330,14 @@ static void ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, void **data) { - update_traceon_count(data, 1); + update_traceon_count(ops, ip, 1); } static void ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, void **data) { - update_traceon_count(data, 0); + update_traceon_count(ops, ip, 0); } static void @@ -379,47 +380,56 @@ static void ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, void **data) { - long *count = (long *)data; + struct ftrace_func_mapper *mapper = ops->private_data; + long *count; long old_count; long new_count; + if (!tracing_is_on()) + return; + + /* unlimited? */ + if (!mapper) { + trace_dump_stack(STACK_SKIP); + return; + } + + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + /* * Stack traces should only execute the number of times the * user specified in the counter. */ do { - - if (!tracing_is_on()) - return; - old_count = *count; if (!old_count) return; - /* unlimited? 
*/ - if (old_count == -1) { - trace_dump_stack(STACK_SKIP); - return; - } - new_count = old_count - 1; new_count = cmpxchg(count, old_count, new_count); if (new_count == old_count) trace_dump_stack(STACK_SKIP); + if (!tracing_is_on()) + return; + } while (new_count != old_count); } -static int update_count(void **data) +static int update_count(struct ftrace_probe_ops *ops, unsigned long ip) { - unsigned long *count = (long *)data; + struct ftrace_func_mapper *mapper = ops->private_data; + long *count = NULL; - if (!*count) - return 0; + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - if (*count != -1) + if (count) { + if (*count <= 0) + return 0; (*count)--; + } return 1; } @@ -428,7 +438,7 @@ static void ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, void **data) { - if (update_count(data)) + if (update_count(ops, ip)) ftrace_dump(DUMP_ALL); } @@ -437,22 +447,26 @@ static void ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, void **data) { - if (update_count(data)) + if (update_count(ops, ip)) ftrace_dump(DUMP_ORIG); } static int ftrace_probe_print(const char *name, struct seq_file *m, - unsigned long ip, void *data) + unsigned long ip, struct ftrace_probe_ops *ops) { - long count = (long)data; + struct ftrace_func_mapper *mapper = ops->private_data; + long *count = NULL; seq_printf(m, "%ps:%s", (void *)ip, name); - if (count == -1) - seq_puts(m, ":unlimited\n"); + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); else - seq_printf(m, ":count=%ld\n", count); + seq_puts(m, ":unlimited\n"); return 0; } @@ -461,55 +475,82 @@ static int ftrace_traceon_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("traceon", m, ip, data); + return ftrace_probe_print("traceon", m, ip, ops); } static int ftrace_traceoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("traceoff", m, ip, data); + return ftrace_probe_print("traceoff", m, ip, ops); } static int ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("stacktrace", m, ip, data); + return ftrace_probe_print("stacktrace", m, ip, ops); } static int ftrace_dump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("dump", m, ip, data); + return ftrace_probe_print("dump", m, ip, ops); } static int ftrace_cpudump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("cpudump", m, ip, data); + return ftrace_probe_print("cpudump", m, ip, ops); +} + + +static int +ftrace_count_init(struct ftrace_probe_ops *ops, unsigned long ip, + void **data) +{ + struct ftrace_func_mapper *mapper = ops->private_data; + + return ftrace_func_mapper_add_ip(mapper, ip, *data); +} + +static void +ftrace_count_free(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct ftrace_func_mapper *mapper = ops->private_data; + + ftrace_func_mapper_remove_ip(mapper, ip); } static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops traceoff_count_probe_ops = { .func = ftrace_traceoff_count, .print = 
ftrace_traceoff_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops stacktrace_count_probe_ops = { .func = ftrace_stacktrace_count, .print = ftrace_stacktrace_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops dump_probe_ops = { .func = ftrace_dump_probe, .print = ftrace_dump_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops cpudump_probe_ops = { @@ -558,6 +599,12 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, if (!strlen(number)) goto out_reg; + if (!ops->private_data) { + ops->private_data = allocate_ftrace_func_mapper(); + if (!ops->private_data) + return -ENOMEM; + } + /* * We use the callback data field (which is a pointer) * as our counter. From 0fe7e7e3f8391d4c9260b41cdb15c7917cb2e5b3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 09:56:26 -0400 Subject: [PATCH 38/60] ftrace: Remove unused unregister_ftrace_function_probe() function Nothing calls unregister_ftrace_function_probe(). Remove it as well as the flag PROBE_TEST_DATA, as this function was the only one to set it. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 18 +++--------------- kernel/trace/trace.h | 3 --- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ac47d1845fdb..5448089e6028 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4061,12 +4061,11 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, enum { PROBE_TEST_FUNC = 1, - PROBE_TEST_DATA = 2 }; static void __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data, int flags) + int flags) { struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *rec_entry; @@ -4119,9 +4118,6 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) continue; - if ((flags & PROBE_TEST_DATA) && entry->data != data) - continue; - /* do this last, since it is the most expensive */ if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, @@ -4166,23 +4162,15 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, free_ftrace_hash(hash); } -void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) -{ - __unregister_ftrace_function_probe(glob, ops, data, - PROBE_TEST_FUNC | PROBE_TEST_DATA); -} - void unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { - __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); + __unregister_ftrace_function_probe(glob, ops, PROBE_TEST_FUNC); } void unregister_ftrace_function_probe_all(char *glob) { - __unregister_ftrace_function_probe(glob, NULL, NULL, 0); + __unregister_ftrace_function_probe(glob, NULL, 0); } static LIST_HEAD(ftrace_commands); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dbbdee21bcc4..507a62e9192e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -964,9 +964,6 @@ extern int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); -extern void unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); extern void unregister_ftrace_function_probe_all(char *glob); From 78f78e07d51e440d01e6b1aef172883821193771 Mon Sep 17 00:00:00 2001 
From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 13:39:59 -0400 Subject: [PATCH 39/60] ftrace: Remove unused unregister_ftrace_function_probe_all() function There are no users of unregister_ftrace_function_probe_all(). The only probe function that is used is unregister_ftrace_function_probe_func(). Rename the internal static function __unregister_ftrace_function_probe() to unregister_ftrace_function_probe_func() and make it global. Also remove the PROBE_TEST_FUNC as it would be always set. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 22 +++------------------- kernel/trace/trace.h | 1 - 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5448089e6028..1c31c74d0819 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4059,13 +4059,8 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return count; } -enum { - PROBE_TEST_FUNC = 1, -}; - -static void -__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - int flags) +void +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *rec_entry; @@ -4115,7 +4110,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, hlist_for_each_entry_safe(entry, tmp, hhd, node) { /* break up if statements for readability */ - if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) + if (entry->ops != ops) continue; /* do this last, since it is the most expensive */ @@ -4162,17 +4157,6 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, free_ftrace_hash(hash); } -void -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) -{ - __unregister_ftrace_function_probe(glob, ops, PROBE_TEST_FUNC); -} - -void unregister_ftrace_function_probe_all(char *glob) -{ - __unregister_ftrace_function_probe(glob, NULL, 0); -} - static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 507a62e9192e..376d5a798489 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -965,7 +965,6 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); -extern void unregister_ftrace_function_probe_all(char *glob); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); From 02b77e2afb492420cabb117f3bbd7452d4b4ed06 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 10:04:26 -0400 Subject: [PATCH 40/60] ftrace: Remove printing of data in showing of a function probe None of the probe users uses the data field anymore of the entry. They all have their own print() function. Remove showing the data field in the generic function as the data field will be going away. 
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1c31c74d0819..15f910a03822 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3154,11 +3154,7 @@ t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); - - if (rec->data) - seq_printf(m, ":%p", rec->data); - seq_putc(m, '\n'); + seq_printf(m, "%ps:%ps\n", (void *)rec->ip, (void *)rec->ops->func); return 0; } From 1a48df0041c2756194e700affb0e2ff084092e28 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 10:27:51 -0400 Subject: [PATCH 41/60] ftrace: Remove data field from ftrace_func_probe structure No users of the function probes use the data field anymore. Remove it, and change the init function to take a void *data parameter instead of a void **data, because the init will just get the data that the registering function received, and there's no state after it is called. The other functions for ftrace_probe_ops still take the data parameter, but it will currently only be passed NULL. It will stay as a parameter for future data to be passed to these functions. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 11 ++++------- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 5 ++--- kernel/trace/trace_functions.c | 4 ++-- 5 files changed, 11 insertions(+), 15 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 15f910a03822..f7fcab8f3aa1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1102,7 +1102,6 @@ struct ftrace_func_probe { struct hlist_node node; struct ftrace_probe_ops *ops; unsigned long ip; - void *data; struct list_head free_list; }; @@ -3152,7 +3151,7 @@ t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) return -EIO; if (rec->ops->print) - return rec->ops->print(m, rec->ip, rec->ops, rec->data); + return rec->ops->print(m, rec->ip, rec->ops, NULL); seq_printf(m, "%ps:%ps\n", (void *)rec->ip, (void *)rec->ops->func); @@ -3735,7 +3734,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); hlist_for_each_entry_rcu_notrace(entry, hhd, node) { if (entry->ip == ip) - entry->ops->func(ip, parent_ip, entry->ops, &entry->data); + entry->ops->func(ip, parent_ip, entry->ops, NULL); } preempt_enable_notrace(); } @@ -3800,7 +3799,7 @@ static bool __disable_ftrace_function_probe(void) static void ftrace_free_entry(struct ftrace_func_probe *entry) { if (entry->ops->free) - entry->ops->free(entry->ops, entry->ip, &entry->data); + entry->ops->free(entry->ops, entry->ip, NULL); kfree(entry); } @@ -4007,15 +4006,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, count++; - entry->data = data; - /* * The caller might want to do something special * for each function we find. We call the callback * to give the caller an opportunity to do so. 
*/ if (ops->init) { - if (ops->init(ops, rec->ip, &entry->data) < 0) { + if (ops->init(ops, rec->ip, data) < 0) { /* caller does not like this func */ kfree(entry); continue; } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7febeb823c62..7a4d578d8887 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6786,11 +6786,11 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip, static int ftrace_snapshot_init(struct ftrace_probe_ops *ops, unsigned long ip, - void **data) + void *data) { struct ftrace_func_mapper *mapper = ops->private_data; - return ftrace_func_mapper_add_ip(mapper, ip, *data); + return ftrace_func_mapper_add_ip(mapper, ip, data); } static void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 376d5a798489..86aa5a2222ba 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -937,7 +937,7 @@ struct ftrace_probe_ops { struct ftrace_probe_ops *ops, void **data); int (*init)(struct ftrace_probe_ops *ops, - unsigned long ip, void **data); + unsigned long ip, void *data); void (*free)(struct ftrace_probe_ops *ops, unsigned long ip, void **data); int (*print)(struct seq_file *m, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ee308312fe87..37902107c44f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2543,11 +2543,10 @@ event_enable_print(struct seq_file *m, unsigned long ip, static int event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) + void *_data) { struct ftrace_func_mapper *mapper = ops->private_data; - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct event_probe_data *data = _data; int ret; ret = ftrace_func_mapper_add_ip(mapper, ip, data); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d9cbde8575a8..56d0fe1e4ea1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -509,11 +509,11 @@ ftrace_cpudump_print(struct seq_file *m, unsigned long ip, static int ftrace_count_init(struct ftrace_probe_ops *ops, unsigned long ip, - void **data) + void *data) { struct ftrace_func_mapper *mapper = ops->private_data; - return ftrace_func_mapper_add_ip(mapper, ip, *data); + return ftrace_func_mapper_add_ip(mapper, ip, data); } static void From e16b35ddb840788e023fac2482b61c0b6bf98057 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 14:46:56 -0400 Subject: [PATCH 42/60] ftrace: Add helper function ftrace_hash_move_and_update_ops() The process of updating an ops filter_hash is a bit complex, and requires setting up an old hash to perform the update. This is done in exactly the same way in two locations, for the same reasons. Create a helper function that does it in one place.
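The consolidated sequence is: snapshot the old hash, attempt the move, and only on success re-patch the call sites and free the old hash. A user-space sketch of that shape, with a toy struct standing in for ftrace_hash and a printf standing in for the code-patching step (all names illustrative):

#include <stdio.h>
#include <stdlib.h>

struct hash {
	int nents;		/* stand-in for the real hash table contents */
};

/* swap in the new hash; on failure the old one must stay live */
static int hash_move(struct hash **orig, struct hash *new)
{
	if (!new)
		return -1;	/* the kernel returns -ENOMEM here */
	*orig = new;
	return 0;
}

static void update_code(struct hash *old)
{
	printf("re-patching call sites (old hash had %d entries)\n",
	       old->nents);
}

/*
 * The helper owns the whole sequence both call sites used to open-code:
 * snapshot, move, and only on success update the code and free the old
 * hash (which the kernel defers with free_ftrace_hash_rcu()).
 */
static int hash_move_and_update(struct hash **orig, struct hash *new)
{
	struct hash *old = *orig;
	int ret;

	ret = hash_move(orig, new);
	if (!ret) {
		update_code(old);
		free(old);
	}
	return ret;
}

int main(void)
{
	struct hash *h0 = calloc(1, sizeof(*h0));
	struct hash *h1 = calloc(1, sizeof(*h1));

	if (!h0 || !h1)
		return 1;
	h0->nents = 2;
	h1->nents = 3;
	/* a caller is now reduced to a single line under its own lock */
	if (hash_move_and_update(&h0, h1))
		fprintf(stderr, "move failed, old hash still live\n");
	free(h0);
	return 0;
}

The design point is that failure leaves the old hash untouched and live, so callers never have to reason about a half-updated filter.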
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 105 +++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f7fcab8f3aa1..5c8d8eea9e7c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3674,6 +3674,56 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) return match_records(hash, buff, len, NULL); } +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_ops_hash *old_hash) +{ + struct ftrace_ops *op; + + if (!ftrace_enabled) + return; + + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); + return; + } + + /* + * If this is the shared global_ops filter, then we need to + * check if there is another ops that shares it, is enabled. + * If so, we still need to run the modify code. + */ + if (ops->func_hash != &global_ops.local_hash) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->func_hash == &global_ops.local_hash && + op->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); + /* Only need to do this once */ + return; + } + } while_for_each_ftrace_op(op); +} + +static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, + struct ftrace_hash **orig_hash, + struct ftrace_hash *hash, + int enable) +{ + struct ftrace_ops_hash old_hash_ops; + struct ftrace_hash *old_hash; + int ret; + + old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; + ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret) { + ftrace_ops_update_code(ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } + return ret; +} /* * We register the module command as a template to show others how @@ -4306,44 +4356,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops, - struct ftrace_ops_hash *old_hash) -{ - struct ftrace_ops *op; - - if (!ftrace_enabled) - return; - - if (ops->flags & FTRACE_OPS_FL_ENABLED) { - ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); - return; - } - - /* - * If this is the shared global_ops filter, then we need to - * check if there is another ops that shares it, is enabled. - * If so, we still need to run the modify code. 
- */ - if (ops->func_hash != &global_ops.local_hash) - return; - - do_for_each_ftrace_op(op, ftrace_ops_list) { - if (op->func_hash == &global_ops.local_hash && - op->flags & FTRACE_OPS_FL_ENABLED) { - ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); - /* Only need to do this once */ - return; - } - } while_for_each_ftrace_op(op); -} - static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; - struct ftrace_ops_hash old_hash_ops; - struct ftrace_hash *old_hash; struct ftrace_hash *hash; int ret; @@ -4378,14 +4395,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } mutex_lock(&ftrace_lock); - old_hash = *orig_hash; - old_hash_ops.filter_hash = ops->func_hash->filter_hash; - old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; - ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret) { - ftrace_ops_update_code(ops, &old_hash_ops); - free_ftrace_hash_rcu(old_hash); - } + ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); mutex_unlock(&ftrace_lock); out_regex_unlock: @@ -4624,10 +4634,8 @@ static void __init set_ftrace_early_filters(void) int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_ops_hash old_hash_ops; struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; - struct ftrace_hash *old_hash; struct trace_parser *parser; int filter_hash; int ret; @@ -4657,15 +4665,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); - old_hash = *orig_hash; - old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; - old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; - ret = ftrace_hash_move(iter->ops, filter_hash, - orig_hash, iter->hash); - if (!ret) { - ftrace_ops_update_code(iter->ops, &old_hash_ops); - free_ftrace_hash_rcu(old_hash); - } + ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash, + iter->hash, filter_hash); mutex_unlock(&ftrace_lock); } else { /* For read only, the hash is the ops hash */ From d3d532d798c5720055ab02a10bf7829a33c3645a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 16:44:43 -0400 Subject: [PATCH 43/60] ftrace: Have unregister_ftrace_function_probe_func() return a value Currently unregister_ftrace_function_probe_func() is a void function. It does not give any feedback if an error occurred or no item was found to remove and nothing was done. Change it to return status and success if it removed something. Also update the callers to return that feedback to the user. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 +++++++++++--- kernel/trace/trace.c | 6 ++---- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 3 +-- kernel/trace/trace_functions.c | 6 ++---- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5c8d8eea9e7c..cbae7fb1be15 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4102,7 +4102,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return count; } -void +int unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { struct ftrace_ops_hash old_hash_ops; @@ -4131,7 +4131,7 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) /* we do not support '!' 
for function probes */ if (WARN_ON(not)) - return; + return -EINVAL; } mutex_lock(&trace_probe_ops.func_hash->regex_lock); @@ -4140,9 +4140,9 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) /* Probes only have filters */ old_hash_ops.notrace_hash = NULL; + ret = -ENOMEM; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) - /* Hmm, should report this somehow */ goto out_unlock; INIT_LIST_HEAD(&free_list); @@ -4173,6 +4173,13 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) list_add(&entry->free_list, &free_list); } } + + /* Nothing found? */ + if (list_empty(&free_list)) { + ret = -EINVAL; + goto out_unlock; + } + mutex_lock(&ftrace_lock); disabled = __disable_ftrace_function_probe(); /* @@ -4198,6 +4205,7 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) out_unlock: mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); + return ret; } static LIST_HEAD(ftrace_commands); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7a4d578d8887..64a4418a5106 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6829,10 +6829,8 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - return 0; - } + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, ops); if (!param) goto out_reg; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 86aa5a2222ba..31d80bff9ee6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -963,7 +963,7 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, extern int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); -extern void +extern int unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); int register_ftrace_command(struct ftrace_func_command *cmd); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 37902107c44f..9e07a5b3869b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2652,8 +2652,7 @@ event_enable_func(struct ftrace_hash *hash, ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - ret = 0; + ret = unregister_ftrace_function_probe_func(glob+1, ops); goto out; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 56d0fe1e4ea1..dcb4d37ed4bd 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -586,10 +586,8 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, if (!enable) return -EINVAL; - if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - return 0; - } + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, ops); if (!param) goto out_reg; From 1ec3a81a0cf4236b644282794932c4eda9c1714a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 18:16:29 -0400 Subject: [PATCH 44/60] ftrace: Have each function probe use its own ftrace_ops Have the function probes have their own ftrace_ops, and remove the trace_probe_ops. This simplifies some of the ftrace infrastructure code. Individual entries for each function are still allocated for use by the set_ftrace_filter output, but they will be removed soon too.
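The ownership change can be sketched in user space: each probe kind embeds its own ops (and with it its own filter state), wired up lazily the first time the probe is registered, instead of every probe funneling through one shared trace_probe_ops. The names below are illustrative stand-ins for the kernel structures:

#include <stdio.h>
#include <stdbool.h>

struct ops {			/* stand-in for struct ftrace_ops */
	void (*func)(unsigned long ip, struct ops *op);
	bool initialized;
	int filter_count;	/* stand-in for the per-ops filter hash */
};

struct probe_ops {		/* stand-in for struct ftrace_probe_ops */
	struct ops ops;		/* embedded: one ops per probe kind */
	const char *name;
};

static void probe_dispatch(unsigned long ip, struct ops *op)
{
	printf("probe hit at %#lx\n", ip);
}

static int register_probe(struct probe_ops *probe, const char *glob)
{
	/* first registration initializes the embedded ops, as in the patch */
	if (!probe->ops.initialized) {
		probe->ops.func = probe_dispatch;
		probe->ops.initialized = true;
	}
	probe->ops.filter_count++;	/* pretend @glob matched one function */
	printf("registered '%s' on %s (filter now has %d entries)\n",
	       probe->name, glob, probe->ops.filter_count);
	return 0;
}

int main(void)
{
	struct probe_ops snapshot = { { 0 }, "snapshot" };
	struct probe_ops traceoff = { { 0 }, "traceoff" };

	/* each probe carries its own filter; they no longer share one hash */
	register_probe(&snapshot, "sched*");
	register_probe(&traceoff, "do_fork");
	return 0;
}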
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 260 +++++++++++++++++------------------------- kernel/trace/trace.h | 1 + 2 files changed, 106 insertions(+), 155 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cbae7fb1be15..cf6b7263199a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3789,63 +3789,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); } -static struct ftrace_ops trace_probe_ops __read_mostly = -{ - .func = function_trace_probe_call, - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(trace_probe_ops) -}; - -static int ftrace_probe_registered; - -static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) -{ - int ret; - int i; - - if (ftrace_probe_registered) { - /* still need to update the function call sites */ - if (ftrace_enabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, - old_hash); - return; - } - - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - if (hhd->first) - break; - } - /* Nothing registered? */ - if (i == FTRACE_FUNC_HASHSIZE) - return; - - ret = ftrace_startup(&trace_probe_ops, 0); - - ftrace_probe_registered = 1; -} - -static bool __disable_ftrace_function_probe(void) -{ - int i; - - if (!ftrace_probe_registered) - return false; - - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - if (hhd->first) - return false; - } - - /* no more funcs left */ - ftrace_shutdown(&trace_probe_ops, 0); - - ftrace_probe_registered = 0; - return true; -} - - static void ftrace_free_entry(struct ftrace_func_probe *entry) { if (entry->ops->free) @@ -3996,110 +3939,110 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) + void *data) { - struct ftrace_ops_hash old_hash_ops; - struct ftrace_func_probe *entry; - struct ftrace_glob func_g; - struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; - struct ftrace_hash *old_hash = *orig_hash; + struct ftrace_func_entry *entry; + struct ftrace_func_probe *probe; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct ftrace_hash *hash; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int not; + struct hlist_head hl; + struct hlist_node *n; unsigned long key; int count = 0; + int size; int ret; + int i; - func_g.type = filter_parse_regex(glob, strlen(glob), - &func_g.search, ¬); - func_g.len = strlen(func_g.search); - - /* we do not support '!' for function probes */ - if (WARN_ON(not)) + /* We do not support '!' for function probes */ + if (WARN_ON(glob[0] == '!')) return -EINVAL; - mutex_lock(&trace_probe_ops.func_hash->regex_lock); - - old_hash_ops.filter_hash = old_hash; - /* Probes only have filters */ - old_hash_ops.notrace_hash = NULL; - - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); - if (!hash) { - count = -ENOMEM; - goto out; + if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) { + ops->ops.func = function_trace_probe_call; + ftrace_ops_init(&ops->ops); } - if (unlikely(ftrace_disabled)) { - count = -ENODEV; + mutex_lock(&ops->ops.func_hash->regex_lock); + + orig_hash = &ops->ops.func_hash->filter_hash; + old_hash = *orig_hash; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + + ret = ftrace_match_records(hash, glob, strlen(glob)); + + /* Nothing found? 
*/ + if (!ret) + ret = -EINVAL; + + if (ret < 0) goto out; + + INIT_HLIST_HEAD(&hl); + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) + continue; + probe = kmalloc(sizeof(*probe), GFP_KERNEL); + if (!probe) { + count = -ENOMEM; + goto err_free; + } + probe->ops = ops; + probe->ip = entry->ip; + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. + */ + if (ops->init && ops->init(ops, entry->ip, data) < 0) { + kfree(probe); + goto err_free; + } + hlist_add_head(&probe->node, &hl); + + count++; + } } mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { + ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, + hash, 1); + if (ret < 0) + goto err_free_unlock; - if (rec->flags & FTRACE_FL_DISABLED) - continue; + hlist_for_each_entry_safe(probe, n, &hl, node) { + hlist_del(&probe->node); + key = hash_long(probe->ip, FTRACE_HASH_BITS); + hlist_add_head_rcu(&probe->node, &ftrace_func_hash[key]); + } - if (!ftrace_match_record(rec, &func_g, NULL, 0)) - continue; + if (!(ops->ops.flags & FTRACE_OPS_FL_ENABLED)) + ret = ftrace_startup(&ops->ops, 0); - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { - /* If we did not process any, then return error */ - if (!count) - count = -ENOMEM; - goto out_unlock; - } - - count++; - - /* - * The caller might want to do something special - * for each function we find. We call the callback - * to give the caller an opportunity to do so. - */ - if (ops->init) { - if (ops->init(ops, rec->ip, data) < 0) { - /* caller does not like this func */ - kfree(entry); - continue; - } - } - - ret = enter_record(hash, rec, 0); - if (ret < 0) { - kfree(entry); - count = ret; - goto out_unlock; - } - - entry->ops = ops; - entry->ip = rec->ip; - - key = hash_long(entry->ip, FTRACE_HASH_BITS); - hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); - - } while_for_each_ftrace_rec(); - - ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); - - __enable_ftrace_function_probe(&old_hash_ops); + mutex_unlock(&ftrace_lock); if (!ret) - free_ftrace_hash_rcu(old_hash); - else - count = ret; - - out_unlock: - mutex_unlock(&ftrace_lock); + ret = count; out: - mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + mutex_unlock(&ops->ops.func_hash->regex_lock); free_ftrace_hash(hash); - return count; + return ret; + + err_free_unlock: + mutex_unlock(&ftrace_lock); + err_free: + hlist_for_each_entry_safe(probe, n, &hl, node) { + hlist_del(&probe->node); + if (ops->free) + ops->free(ops, probe->ip, NULL); + kfree(probe); + } + goto out; } int @@ -4110,14 +4053,16 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) struct ftrace_func_probe *entry; struct ftrace_func_probe *p; struct ftrace_glob func_g; - struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; - struct ftrace_hash *old_hash = *orig_hash; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct list_head free_list; - struct ftrace_hash *hash; + struct ftrace_hash *hash = NULL; struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; int i, ret; - bool disabled; + + if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) + return -EINVAL; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) func_g.search = NULL; @@ -4134,14 +4079,21 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) return 
-EINVAL; } - mutex_lock(&trace_probe_ops.func_hash->regex_lock); + mutex_lock(&ops->ops.func_hash->regex_lock); + + orig_hash = &ops->ops.func_hash->filter_hash; + old_hash = *orig_hash; + + ret = -EINVAL; + if (ftrace_hash_empty(old_hash)) + goto out_unlock; old_hash_ops.filter_hash = old_hash; /* Probes only have filters */ old_hash_ops.notrace_hash = NULL; ret = -ENOMEM; - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); if (!hash) goto out_unlock; @@ -4181,20 +4133,18 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) } mutex_lock(&ftrace_lock); - disabled = __disable_ftrace_function_probe(); - /* - * Remove after the disable is called. Otherwise, if the last - * probe is removed, a null hash means *all enabled*. - */ - ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + if (ftrace_hash_empty(hash)) + ftrace_shutdown(&ops->ops, 0); + + ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, + hash, 1); /* still need to update the function call sites */ - if (ftrace_enabled && !disabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + if (ftrace_enabled && !ftrace_hash_empty(hash)) + ftrace_run_modify_code(&ops->ops, FTRACE_UPDATE_CALLS, &old_hash_ops); synchronize_sched(); - if (!ret) - free_ftrace_hash_rcu(old_hash); list_for_each_entry_safe(entry, p, &free_list, free_list) { list_del(&entry->free_list); @@ -4203,7 +4153,7 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + mutex_unlock(&ops->ops.func_hash->regex_lock); free_ftrace_hash(hash); return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 31d80bff9ee6..e16c67c49de4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -932,6 +932,7 @@ static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) struct ftrace_probe_ops { + struct ftrace_ops ops; void (*func)(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, From eee8ded131f15e0f5b1897c9c4a7687fabd28822 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 21:31:28 -0400 Subject: [PATCH 45/60] ftrace: Have the function probes call their own function Now that the function probes have their own ftrace_ops, there's no reason to continue using the ftrace_func_hash to find which probe to call in the function callback. The ops that is passed in to the function callback is part of the probe_ops to call. 
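The dispatch now rests on a container_of() recovery: because the ops is embedded in the probe_ops, the one generic trampoline can map the ops pointer it is handed back to the enclosing probe_ops and call its handler directly, with no hash walk. A self-contained sketch of that idiom (illustrative names; container_of is spelled out for user space):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ops {
	void (*func)(unsigned long ip, struct ops *op);
};

struct probe_ops {
	struct ops ops;		/* embedded base, as in ftrace_probe_ops */
	void (*handler)(unsigned long ip);
};

/* one generic trampoline shared by all probes */
static void trampoline(unsigned long ip, struct ops *op)
{
	struct probe_ops *probe = container_of(op, struct probe_ops, ops);

	probe->handler(ip);
}

static void my_handler(unsigned long ip)
{
	printf("probe fired at %#lx\n", ip);
}

int main(void)
{
	struct probe_ops probe = { { trampoline }, my_handler };

	/* the tracing core only ever sees &probe.ops */
	probe.ops.func(0x1234, &probe.ops);
	return 0;
}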
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 +- kernel/trace/ftrace.c | 225 ++++++++++++++++++----------------------- kernel/trace/trace.h | 1 + 3 files changed, 101 insertions(+), 129 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 774e7a95c201..6d2a63e4ea52 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -443,8 +443,8 @@ enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_NOTRACE = (1 << 1), FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_DO_HASH = (1 << 3), - FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_DO_PROBES = (1 << 3), + FTRACE_ITER_PROBE = (1 << 4), FTRACE_ITER_ENABLED = (1 << 5), }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cf6b7263199a..493c7ff7e860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1096,14 +1096,7 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long ip; - struct list_head free_list; -}; +static LIST_HEAD(ftrace_func_probes); struct ftrace_func_entry { struct hlist_node hlist; @@ -1270,7 +1263,7 @@ static void remove_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { - hlist_del(&entry->hlist); + hlist_del_rcu(&entry->hlist); hash->count--; } @@ -3063,35 +3056,58 @@ struct ftrace_iterator { loff_t func_pos; struct ftrace_page *pg; struct dyn_ftrace *func; - struct ftrace_func_probe *probe; + struct ftrace_probe_ops *probe; + struct ftrace_func_entry *probe_entry; struct trace_parser parser; struct ftrace_hash *hash; struct ftrace_ops *ops; - int hidx; + int pidx; int idx; unsigned flags; }; static void * -t_hash_next(struct seq_file *m, loff_t *pos) +t_probe_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct ftrace_hash *hash; + struct list_head *next; struct hlist_node *hnd = NULL; struct hlist_head *hhd; + int size; (*pos)++; iter->pos = *pos; - if (iter->probe) - hnd = &iter->probe->node; - retry: - if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + if (list_empty(&ftrace_func_probes)) return NULL; - hhd = &ftrace_func_hash[iter->hidx]; + if (!iter->probe) { + next = ftrace_func_probes.next; + iter->probe = list_entry(next, struct ftrace_probe_ops, list); + } + + if (iter->probe_entry) + hnd = &iter->probe_entry->hlist; + + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + + retry: + if (iter->pidx >= size) { + if (iter->probe->list.next == &ftrace_func_probes) + return NULL; + next = iter->probe->list.next; + iter->probe = list_entry(next, struct ftrace_probe_ops, list); + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + iter->pidx = 0; + } + + hhd = &hash->buckets[iter->pidx]; if (hlist_empty(hhd)) { - iter->hidx++; + iter->pidx++; hnd = NULL; goto retry; } @@ -3101,7 +3117,7 @@ t_hash_next(struct seq_file *m, loff_t *pos) else { hnd = hnd->next; if (!hnd) { - iter->hidx++; + iter->pidx++; goto retry; } } @@ -3109,26 +3125,28 @@ t_hash_next(struct seq_file *m, loff_t *pos) if (WARN_ON_ONCE(!hnd)) return NULL; - iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist); return iter; } -static void *t_hash_start(struct seq_file *m, loff_t *pos) +static void *t_probe_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = 
m->private; void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_DO_HASH)) + if (!(iter->flags & FTRACE_ITER_DO_PROBES)) return NULL; if (iter->func_pos > *pos) return NULL; - iter->hidx = 0; + iter->probe = NULL; + iter->probe_entry = NULL; + iter->pidx = 0; for (l = 0; l <= (*pos - iter->func_pos); ) { - p = t_hash_next(m, &l); + p = t_probe_next(m, &l); if (!p) break; } @@ -3136,24 +3154,27 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) return NULL; /* Only set this if we have an item */ - iter->flags |= FTRACE_ITER_HASH; + iter->flags |= FTRACE_ITER_PROBE; return iter; } static int -t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) +t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) { - struct ftrace_func_probe *rec; + struct ftrace_probe_ops *probe; + struct ftrace_func_entry *probe_entry; - rec = iter->probe; - if (WARN_ON_ONCE(!rec)) + probe = iter->probe; + probe_entry = iter->probe_entry; + + if (WARN_ON_ONCE(!probe || !probe_entry)) return -EIO; - if (rec->ops->print) - return rec->ops->print(m, rec->ip, rec->ops, NULL); + if (probe->print) + return probe->print(m, probe_entry->ip, probe, NULL); - seq_printf(m, "%ps:%ps\n", (void *)rec->ip, (void *)rec->ops->func); + seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, (void *)probe->func); return 0; } @@ -3205,19 +3226,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (unlikely(ftrace_disabled)) return NULL; - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, pos); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_next(m, pos); if (iter->flags & FTRACE_ITER_PRINTALL) { - /* next must increment pos, and t_hash_start does not */ + /* next must increment pos, and t_probe_start does not */ (*pos)++; - return t_hash_start(m, &l); + return t_probe_start(m, &l); } ret = t_func_next(m, pos); if (!ret) - return t_hash_start(m, &l); + return t_probe_start(m, &l); return ret; } @@ -3226,7 +3247,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -3255,15 +3276,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) ftrace_hash_empty(iter->hash)) { iter->func_pos = 1; /* Account for the message */ if (*pos > 0) - return t_hash_start(m, pos); + return t_probe_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; /* reset in case of seek/pread */ - iter->flags &= ~FTRACE_ITER_HASH; + iter->flags &= ~FTRACE_ITER_PROBE; return iter; } - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_start(m, pos); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_start(m, pos); /* * Unfortunately, we need to restart at ftrace_pages_start @@ -3279,7 +3300,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) } if (!p) - return t_hash_start(m, pos); + return t_probe_start(m, pos); return iter; } @@ -3310,8 +3331,8 @@ static int t_show(struct seq_file *m, void *v) struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec; - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, iter); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) @@ -3490,7 +3511,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_ops *ops = inode->i_private; return ftrace_regex_open(ops, - FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + 
FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, inode, file); } @@ -3765,16 +3786,9 @@ core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct ftrace_func_probe *entry; - struct hlist_head *hhd; - unsigned long key; + struct ftrace_probe_ops *probe_ops; - key = hash_long(ip, FTRACE_HASH_BITS); - - hhd = &ftrace_func_hash[key]; - - if (hlist_empty(hhd)) - return; + probe_ops = container_of(op, struct ftrace_probe_ops, ops); /* * Disable preemption for these calls to prevent a RCU grace @@ -3782,20 +3796,10 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu_notrace(entry, hhd, node) { - if (entry->ip == ip) - entry->ops->func(ip, parent_ip, entry->ops, NULL); - } + probe_ops->func(ip, parent_ip, probe_ops, NULL); preempt_enable_notrace(); } -static void ftrace_free_entry(struct ftrace_func_probe *entry) -{ - if (entry->ops->free) - entry->ops->free(entry->ops, entry->ip, NULL); - kfree(entry); -} - struct ftrace_func_map { struct ftrace_func_entry entry; void *data; @@ -3942,13 +3946,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; struct ftrace_hash *hash; - struct hlist_head hl; - struct hlist_node *n; - unsigned long key; int count = 0; int size; int ret; @@ -3961,6 +3961,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) { ops->ops.func = function_trace_probe_call; ftrace_ops_init(&ops->ops); + INIT_LIST_HEAD(&ops->list); } mutex_lock(&ops->ops.func_hash->regex_lock); @@ -3978,31 +3979,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (ret < 0) goto out; - INIT_HLIST_HEAD(&hl); - size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { if (ftrace_lookup_ip(old_hash, entry->ip)) continue; - probe = kmalloc(sizeof(*probe), GFP_KERNEL); - if (!probe) { - count = -ENOMEM; - goto err_free; - } - probe->ops = ops; - probe->ip = entry->ip; /* * The caller might want to do something special * for each function we find. We call the callback * to give the caller an opportunity to do so. 
*/ - if (ops->init && ops->init(ops, entry->ip, data) < 0) { - kfree(probe); - goto err_free; + if (ops->init) { + ret = ops->init(ops, entry->ip, data); + if (ret < 0) + goto out; } - hlist_add_head(&probe->node, &hl); - count++; } } @@ -4012,17 +4003,15 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, hash, 1); if (ret < 0) - goto err_free_unlock; + goto out_unlock; - hlist_for_each_entry_safe(probe, n, &hl, node) { - hlist_del(&probe->node); - key = hash_long(probe->ip, FTRACE_HASH_BITS); - hlist_add_head_rcu(&probe->node, &ftrace_func_hash[key]); - } + if (list_empty(&ops->list)) + list_add(&ops->list, &ftrace_func_probes); if (!(ops->ops.flags & FTRACE_OPS_FL_ENABLED)) ret = ftrace_startup(&ops->ops, 0); + out_unlock: mutex_unlock(&ftrace_lock); if (!ret) @@ -4032,34 +4021,22 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, free_ftrace_hash(hash); return ret; - - err_free_unlock: - mutex_unlock(&ftrace_lock); - err_free: - hlist_for_each_entry_safe(probe, n, &hl, node) { - hlist_del(&probe->node); - if (ops->free) - ops->free(ops, probe->ip, NULL); - kfree(probe); - } - goto out; } int unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { struct ftrace_ops_hash old_hash_ops; - struct ftrace_func_entry *rec_entry; - struct ftrace_func_probe *entry; - struct ftrace_func_probe *p; + struct ftrace_func_entry *entry; struct ftrace_glob func_g; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; - struct list_head free_list; struct ftrace_hash *hash = NULL; struct hlist_node *tmp; + struct hlist_head hhd; char str[KSYM_SYMBOL_LEN]; int i, ret; + int size; if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) return -EINVAL; @@ -4097,18 +4074,12 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) if (!hash) goto out_unlock; - INIT_LIST_HEAD(&free_list); + INIT_HLIST_HEAD(&hhd); - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { - hlist_for_each_entry_safe(entry, tmp, hhd, node) { - - /* break up if statements for readability */ - if (entry->ops != ops) - continue; - - /* do this last, since it is the most expensive */ if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, NULL, str); @@ -4116,26 +4087,24 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) continue; } - rec_entry = ftrace_lookup_ip(hash, entry->ip); - /* It is possible more than one entry had this ip */ - if (rec_entry) - free_hash_entry(hash, rec_entry); - - hlist_del_rcu(&entry->node); - list_add(&entry->free_list, &free_list); + remove_hash_entry(hash, entry); + hlist_add_head(&entry->hlist, &hhd); } } /* Nothing found? 
*/ - if (list_empty(&free_list)) { + if (hlist_empty(&hhd)) { ret = -EINVAL; goto out_unlock; } mutex_lock(&ftrace_lock); - if (ftrace_hash_empty(hash)) + if (ftrace_hash_empty(hash)) { ftrace_shutdown(&ops->ops, 0); + list_del_init(&ops->list); + } + ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, hash, 1); @@ -4146,9 +4115,11 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) &old_hash_ops); synchronize_sched(); - list_for_each_entry_safe(entry, p, &free_list, free_list) { - list_del(&entry->free_list); - ftrace_free_entry(entry); + hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { + hlist_del(&entry->hlist); + if (ops->free) + ops->free(ops, entry->ip, NULL); + kfree(entry); } mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e16c67c49de4..d457addcc224 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -933,6 +933,7 @@ static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { struct ftrace_probe_ops { struct ftrace_ops ops; + struct list_head list; void (*func)(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, From 8d70725e452cac9796e9025ccd79c45ffcc4d109 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 Apr 2017 13:36:18 -0400 Subject: [PATCH 46/60] ftrace: If the hash for a probe fails to update then free what was initialized If ftrace_hash_move_and_update_ops() fails, and an ops->free() function exists, then it needs to be called for each function that was added by this registration. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 493c7ff7e860..8394055e6793 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4003,7 +4003,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, hash, 1); if (ret < 0) - goto out_unlock; + goto err_unlock; if (list_empty(&ops->list)) list_add(&ops->list, &ftrace_func_probes); @@ -4021,6 +4021,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, free_ftrace_hash(hash); return ret; + + err_unlock: + if (!ops->free) + goto out_unlock; + + /* Failed to do the move, need to call the free functions */ + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) + continue; + ops->free(ops, entry->ip, NULL); + } + } + goto out_unlock; } int From 04ec7bb642b77374b53731b795b5654b5aff1c00 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 Apr 2017 13:12:55 -0400 Subject: [PATCH 47/60] tracing: Have the trace_array hold the list of registered func probes Add a linked list to the trace_array to hold the func probes that are registered. Currently, all function probes behave as they did before: only the top-level trace_array holds the function probes. But this lays the groundwork for attaching function probes to individual instances, and for having an event trigger only affect events in the given instance. That work is still to be done.
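In rough outline (simplified here, with unrelated fields elided), the shape this patch creates is:

struct trace_array {
	/* ... */
#ifdef CONFIG_DYNAMIC_FTRACE
	struct list_head	func_probes;	/* probes registered on this instance */
#endif
	/* ... */
};

void ftrace_init_trace_array(struct trace_array *tr)
{
	INIT_LIST_HEAD(&tr->func_probes);
}

Both the top-level array (via ftrace_init_global_array_ops()) and newly created instances (via instance_mkdir()) run this init, so every trace_array starts out with an empty probe list.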
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 41 ++++++++++++++++++++++++---------- kernel/trace/trace.c | 6 +++-- kernel/trace/trace.h | 13 ++++++++--- kernel/trace/trace_events.c | 5 ++--- kernel/trace/trace_functions.c | 21 ++++++++--------- 5 files changed, 56 insertions(+), 30 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8394055e6793..ea208e93f000 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1096,8 +1096,6 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -static LIST_HEAD(ftrace_func_probes); - struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; @@ -3070,6 +3068,8 @@ static void * t_probe_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct trace_array *tr = global_ops.private; + struct list_head *func_probes; struct ftrace_hash *hash; struct list_head *next; struct hlist_node *hnd = NULL; @@ -3079,11 +3079,15 @@ t_probe_next(struct seq_file *m, loff_t *pos) (*pos)++; iter->pos = *pos; - if (list_empty(&ftrace_func_probes)) + if (!tr) + return NULL; + + func_probes = &tr->func_probes; + if (list_empty(func_probes)) return NULL; if (!iter->probe) { - next = ftrace_func_probes.next; + next = func_probes->next; iter->probe = list_entry(next, struct ftrace_probe_ops, list); } @@ -3095,7 +3099,7 @@ t_probe_next(struct seq_file *m, loff_t *pos) retry: if (iter->pidx >= size) { - if (iter->probe->list.next == &ftrace_func_probes) + if (iter->probe->list.next == func_probes) return NULL; next = iter->probe->list.next; iter->probe = list_entry(next, struct ftrace_probe_ops, list); @@ -3752,7 +3756,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, */ static int -ftrace_mod_callback(struct ftrace_hash *hash, +ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, char *func, char *cmd, char *module, int enable) { int ret; @@ -3942,8 +3946,8 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, } int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_entry *entry; struct ftrace_hash **orig_hash; @@ -3954,6 +3958,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int ret; int i; + if (WARN_ON(!tr)) + return -EINVAL; + /* We do not support '!' 
for function probes */ if (WARN_ON(glob[0] == '!')) return -EINVAL; @@ -4006,7 +4013,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, goto err_unlock; if (list_empty(&ops->list)) - list_add(&ops->list, &ftrace_func_probes); + list_add(&ops->list, &tr->func_probes); if (!(ops->ops.flags & FTRACE_OPS_FL_ENABLED)) ret = ftrace_startup(&ops->ops, 0); @@ -4192,9 +4199,11 @@ __init int unregister_ftrace_command(struct ftrace_func_command *cmd) return ret; } -static int ftrace_process_regex(struct ftrace_hash *hash, +static int ftrace_process_regex(struct ftrace_iterator *iter, char *buff, int len, int enable) { + struct ftrace_hash *hash = iter->hash; + struct trace_array *tr = global_ops.private; char *func, *command, *next = buff; struct ftrace_func_command *p; int ret = -EINVAL; @@ -4214,10 +4223,13 @@ static int ftrace_process_regex(struct ftrace_hash *hash, command = strsep(&next, ":"); + if (WARN_ON_ONCE(!tr)) + return -EINVAL; + mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(hash, func, command, next, enable); + ret = p->func(tr, hash, func, command, next, enable); goto out_unlock; } } @@ -4254,7 +4266,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { - ret = ftrace_process_regex(iter->hash, parser->buffer, + ret = ftrace_process_regex(iter, parser->buffer, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) @@ -5441,6 +5453,10 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) arch_ftrace_update_trampoline(ops); } +void ftrace_init_trace_array(struct trace_array *tr) +{ + INIT_LIST_HEAD(&tr->func_probes); +} #else static struct ftrace_ops global_ops = { @@ -5495,6 +5511,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr) { tr->ops = &global_ops; tr->ops->private = tr; + ftrace_init_trace_array(tr); } void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 64a4418a5106..86598293787a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6815,7 +6815,7 @@ static struct ftrace_probe_ops snapshot_count_probe_ops = { }; static int -ftrace_trace_snapshot_callback(struct ftrace_hash *hash, +ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -6855,7 +6855,7 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = register_ftrace_function_probe(glob, tr, ops, count); if (ret >= 0) alloc_snapshot(&global_trace); @@ -7468,6 +7468,8 @@ static int instance_mkdir(const char *name) goto out_free_tr; } + ftrace_init_trace_array(tr); + init_tracer_tracefs(tr, tr->dir); init_trace_flags_index(tr); __update_tracer_options(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d457addcc224..68ff25e4cb19 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -262,6 +262,9 @@ struct trace_array { #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; +#ifdef CONFIG_DYNAMIC_FTRACE + struct list_head func_probes; +#endif /* function tracing enabled */ int function_enabled; #endif @@ -696,6 +699,9 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern 
unsigned long ftrace_update_tot_cnt; +void ftrace_init_trace_array(struct trace_array *tr); +#else +static inline void ftrace_init_trace_array(struct trace_array *tr) { } #endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); @@ -883,7 +889,8 @@ extern struct list_head ftrace_pids; struct ftrace_func_command { struct list_head list; char *name; - int (*func)(struct ftrace_hash *hash, + int (*func)(struct trace_array *tr, + struct ftrace_hash *hash, char *func, char *cmd, char *params, int enable); }; @@ -963,8 +970,8 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, ftrace_mapper_func free_func); extern int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops, void *data); extern int unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9e07a5b3869b..f0d6e5aef53e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2611,10 +2611,9 @@ static struct ftrace_probe_ops event_disable_count_probe_ops = { }; static int -event_enable_func(struct ftrace_hash *hash, +event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { - struct trace_array *tr = top_trace_array(); struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; @@ -2701,7 +2700,7 @@ event_enable_func(struct ftrace_hash *hash, if (ret < 0) goto out_put; - ret = register_ftrace_function_probe(glob, ops, data); + ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index dcb4d37ed4bd..2c8961b35401 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -574,7 +574,8 @@ static struct ftrace_probe_ops stacktrace_probe_ops = { }; static int -ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, +ftrace_trace_probe_callback(struct trace_array *tr, + struct ftrace_probe_ops *ops, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { @@ -612,13 +613,13 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = register_ftrace_function_probe(glob, tr, ops, count); return ret < 0 ? ret : 0; } static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, +ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -629,24 +630,24 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, else ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, param, enable); } static int -ftrace_stacktrace_callback(struct ftrace_hash *hash, +ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; ops = param ? 
&stacktrace_count_probe_ops : &stacktrace_probe_ops; - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, param, enable); } static int -ftrace_dump_callback(struct ftrace_hash *hash, +ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -654,12 +655,12 @@ ftrace_dump_callback(struct ftrace_hash *hash, ops = &dump_probe_ops; /* Only dump once. */ - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, "1", enable); } static int -ftrace_cpudump_callback(struct ftrace_hash *hash, +ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -667,7 +668,7 @@ ftrace_cpudump_callback(struct ftrace_hash *hash, ops = &cpudump_probe_ops; /* Only dump once. */ - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, "1", enable); } From b5f081b563a6cdcb85a543df8c851951a8978275 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 10 Apr 2017 22:30:05 -0400 Subject: [PATCH 48/60] tracing: Pass the trace_array into ftrace_probe_ops functions Pass the trace_array associated with a ftrace_probe_ops into the probe_ops func(), init() and free() functions. The trace_array is the descriptor for a tracing instance. This will help create the infrastructure that will allow function probes to be unique to tracing instances. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 13 +++++++++---- kernel/trace/trace.c | 14 ++++++++------ kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 16 +++++++++------- kernel/trace/trace_functions.c | 35 +++++++++++++++++++++------------- 5 files changed, 51 insertions(+), 30 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ea208e93f000..e51cd6b51253 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3791,6 +3791,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ftrace_probe_ops *probe_ops; + struct trace_array *tr = op->private; probe_ops = container_of(op, struct ftrace_probe_ops, ops); @@ -3800,7 +3801,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here.
*/ if (ops->init) { - ret = ops->init(ops, entry->ip, data); + ret = ops->init(ops, tr, entry->ip, data); if (ret < 0) goto out; } @@ -4038,7 +4040,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, hlist_for_each_entry(entry, &hash->buckets[i], hlist) { if (ftrace_lookup_ip(old_hash, entry->ip)) continue; - ops->free(ops, entry->ip, NULL); + ops->free(ops, tr, entry->ip, NULL); } } goto out_unlock; @@ -4055,6 +4057,7 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) struct ftrace_hash *hash = NULL; struct hlist_node *tmp; struct hlist_head hhd; + struct trace_array *tr; char str[KSYM_SYMBOL_LEN]; int i, ret; int size; @@ -4062,6 +4065,8 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) return -EINVAL; + tr = ops->ops.private; + if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) func_g.search = NULL; else if (glob) { @@ -4139,7 +4144,7 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { hlist_del(&entry->hlist); if (ops->free) - ops->free(ops, entry->ip, NULL); + ops->free(ops, tr, entry->ip, NULL); kfree(entry); } mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 86598293787a..368310e78d45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6736,14 +6736,16 @@ static const struct file_operations tracing_dyn_info_fops = { #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) static void ftrace_snapshot(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **data) { tracing_snapshot(); } static void ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **data) { struct ftrace_func_mapper *mapper = ops->private_data; long *count = NULL; @@ -6785,8 +6787,8 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip, } static int -ftrace_snapshot_init(struct ftrace_probe_ops *ops, unsigned long ip, - void *data) +ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) { struct ftrace_func_mapper *mapper = ops->private_data; @@ -6794,8 +6796,8 @@ ftrace_snapshot_init(struct ftrace_probe_ops *ops, unsigned long ip, } static void -ftrace_snapshot_free(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) +ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void **data) { struct ftrace_func_mapper *mapper = ops->private_data; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 68ff25e4cb19..390761804886 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -943,11 +943,14 @@ struct ftrace_probe_ops { struct list_head list; void (*func)(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, void **data); int (*init)(struct ftrace_probe_ops *ops, + struct trace_array *tr, unsigned long ip, void *data); void (*free)(struct ftrace_probe_ops *ops, + struct trace_array *tr, unsigned long ip, void **data); int (*print)(struct seq_file *m, unsigned long ip, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f0d6e5aef53e..713bec614312 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ 
-2470,7 +2470,8 @@ static void update_event_probe(struct event_probe_data *data) static void event_enable_probe(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **_data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **_data) { struct ftrace_func_mapper *mapper = ops->private_data; struct event_probe_data *data; @@ -2486,7 +2487,8 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, static void event_enable_count_probe(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **_data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **_data) { struct ftrace_func_mapper *mapper = ops->private_data; struct event_probe_data *data; @@ -2513,7 +2515,7 @@ event_enable_count_probe(unsigned long ip, unsigned long parent_ip, static int event_enable_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *_data) + struct ftrace_probe_ops *ops, void *_data) { struct ftrace_func_mapper *mapper = ops->private_data; struct event_probe_data *data; @@ -2542,8 +2544,8 @@ event_enable_print(struct seq_file *m, unsigned long ip, } static int -event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, - void *_data) +event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *_data) { struct ftrace_func_mapper *mapper = ops->private_data; struct event_probe_data *data = _data; @@ -2559,8 +2561,8 @@ event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, } static void -event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) +event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void **_data) { struct ftrace_func_mapper *mapper = ops->private_data; struct event_probe_data *data; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 2c8961b35401..797f087183c5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -328,21 +328,24 @@ static void update_traceon_count(struct ftrace_probe_ops *ops, static void ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **data) { update_traceon_count(ops, ip, 1); } static void ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **data) { update_traceon_count(ops, ip, 0); } static void ftrace_traceon(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **data) { if (tracing_is_on()) return; @@ -352,7 +355,8 @@ ftrace_traceon(unsigned long ip, unsigned long parent_ip, static void ftrace_traceoff(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **data) { if (!tracing_is_on()) return; @@ -371,14 +375,16 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static void ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **data) { trace_dump_stack(STACK_SKIP); } static void ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops 
*ops, + void **data) { struct ftrace_func_mapper *mapper = ops->private_data; long *count; @@ -436,7 +442,8 @@ static int update_count(struct ftrace_probe_ops *ops, unsigned long ip) static void ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **data) { if (update_count(ops, ip)) ftrace_dump(DUMP_ALL); @@ -445,7 +452,8 @@ ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, /* Only dump the current CPU buffer. */ static void ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, - struct ftrace_probe_ops *ops, void **data) + struct trace_array *tr, struct ftrace_probe_ops *ops, + void **data) { if (update_count(ops, ip)) ftrace_dump(DUMP_ORIG); @@ -473,7 +481,8 @@ ftrace_probe_print(const char *name, struct seq_file *m, static int ftrace_traceon_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) + struct ftrace_probe_ops *ops, + void *data) { return ftrace_probe_print("traceon", m, ip, ops); } @@ -508,8 +517,8 @@ ftrace_cpudump_print(struct seq_file *m, unsigned long ip, static int -ftrace_count_init(struct ftrace_probe_ops *ops, unsigned long ip, - void *data) +ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) { struct ftrace_func_mapper *mapper = ops->private_data; @@ -517,8 +526,8 @@ ftrace_count_init(struct ftrace_probe_ops *ops, unsigned long ip, } static void -ftrace_count_free(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) +ftrace_count_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void **_data) { struct ftrace_func_mapper *mapper = ops->private_data; From 7b60f3d8761561d95d7e962522d6338143fc2329 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 18 Apr 2017 14:50:39 -0400 Subject: [PATCH 49/60] ftrace: Dynamically create the probe ftrace_ops for the trace_array In order to eventually have each trace_array instance have its own unique set of function probes (triggers), the trace array needs to hold the ops and the filters for the probes. This is the first step toward that goal. Instead of having the private data of the probe ops point to the trace_array, create a separate list that the trace_array holds. There's only one private_data for a probe, but we need one per trace_array. The probe ftrace_ops will be dynamically created for each instance, instead of being static.
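In outline (simplified, with locking and the preemption guard omitted), the dynamically created wrapper and the callback's recovery of it look like this:

struct ftrace_func_probe {
	struct ftrace_probe_ops	*probe_ops;	/* the shared callbacks */
	struct ftrace_ops	ops;		/* per-instance, registered with ftrace */
	struct trace_array	*tr;		/* the owning instance */
	struct list_head	list;		/* linked on tr->func_probes */
	int			ref;		/* lifetime of this wrapper */
};

static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
				      struct ftrace_ops *op, struct pt_regs *pt_regs)
{
	struct ftrace_func_probe *probe;

	/* op is embedded in the wrapper, so the wrapper can be recovered */
	probe = container_of(op, struct ftrace_func_probe, ops);
	probe->probe_ops->func(ip, parent_ip, probe->tr, probe->probe_ops, NULL);
}

register_ftrace_function_probe() allocates one such wrapper the first time a given probe_ops is set on an instance, and release_probe() frees it when the last reference is dropped.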
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 194 ++++++++++++++++++++++++--------- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 5 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions.c | 2 +- 5 files changed, 147 insertions(+), 58 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e51cd6b51253..8fdc18500c61 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1101,6 +1101,14 @@ struct ftrace_func_entry { unsigned long ip; }; +struct ftrace_func_probe { + struct ftrace_probe_ops *probe_ops; + struct ftrace_ops ops; + struct trace_array *tr; + struct list_head list; + int ref; +}; + /* * We make these constant because no one should touch them, * but they are used as the default "empty hash", to avoid allocating @@ -3054,7 +3062,7 @@ struct ftrace_iterator { loff_t func_pos; struct ftrace_page *pg; struct dyn_ftrace *func; - struct ftrace_probe_ops *probe; + struct ftrace_func_probe *probe; struct ftrace_func_entry *probe_entry; struct trace_parser parser; struct ftrace_hash *hash; @@ -3088,7 +3096,7 @@ t_probe_next(struct seq_file *m, loff_t *pos) if (!iter->probe) { next = func_probes->next; - iter->probe = list_entry(next, struct ftrace_probe_ops, list); + iter->probe = list_entry(next, struct ftrace_func_probe, list); } if (iter->probe_entry) @@ -3102,7 +3110,7 @@ t_probe_next(struct seq_file *m, loff_t *pos) if (iter->probe->list.next == func_probes) return NULL; next = iter->probe->list.next; - iter->probe = list_entry(next, struct ftrace_probe_ops, list); + iter->probe = list_entry(next, struct ftrace_func_probe, list); hash = iter->probe->ops.func_hash->filter_hash; size = 1 << hash->size_bits; iter->pidx = 0; @@ -3166,8 +3174,9 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos) static int t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) { - struct ftrace_probe_ops *probe; struct ftrace_func_entry *probe_entry; + struct ftrace_probe_ops *probe_ops; + struct ftrace_func_probe *probe; probe = iter->probe; probe_entry = iter->probe_entry; @@ -3175,10 +3184,13 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) if (WARN_ON_ONCE(!probe || !probe_entry)) return -EIO; - if (probe->print) - return probe->print(m, probe_entry->ip, probe, NULL); + probe_ops = probe->probe_ops; - seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, (void *)probe->func); + if (probe_ops->print) + return probe_ops->print(m, probe_entry->ip, probe_ops, NULL); + + seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, + (void *)probe_ops->func); return 0; } @@ -3791,9 +3803,10 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ftrace_probe_ops *probe_ops; - struct trace_array *tr = op->private; + struct ftrace_func_probe *probe; - probe_ops = container_of(op, struct ftrace_probe_ops, ops); + probe = container_of(op, struct ftrace_func_probe, ops); + probe_ops = probe->probe_ops; /* * Disable preemption for these calls to prevent a RCU grace @@ -3801,7 +3814,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. 
*/ preempt_disable_notrace(); - probe_ops->func(ip, parent_ip, tr, probe_ops, NULL); + probe_ops->func(ip, parent_ip, probe->tr, probe_ops, NULL); preempt_enable_notrace(); } @@ -3946,11 +3959,41 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, free_ftrace_hash(&mapper->hash); } +static void release_probe(struct ftrace_func_probe *probe) +{ + struct ftrace_probe_ops *probe_ops; + + mutex_lock(&ftrace_lock); + + WARN_ON(probe->ref <= 0); + + /* Subtract the ref that was used to protect this instance */ + probe->ref--; + + if (!probe->ref) { + probe_ops = probe->probe_ops; + list_del(&probe->list); + kfree(probe); + } + mutex_unlock(&ftrace_lock); +} + +static void acquire_probe_locked(struct ftrace_func_probe *probe) +{ + /* + * Add one ref to keep it from being freed when releasing the + * ftrace_lock mutex. + */ + probe->ref++; +} + int register_ftrace_function_probe(char *glob, struct trace_array *tr, - struct ftrace_probe_ops *ops, void *data) + struct ftrace_probe_ops *probe_ops, + void *data) { struct ftrace_func_entry *entry; + struct ftrace_func_probe *probe; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; struct ftrace_hash *hash; @@ -3966,16 +4009,33 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, if (WARN_ON(glob[0] == '!')) return -EINVAL; - if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) { - ops->ops.func = function_trace_probe_call; - ftrace_ops_init(&ops->ops); - INIT_LIST_HEAD(&ops->list); - ops->ops.private = tr; + + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(probe, &tr->func_probes, list) { + if (probe->probe_ops == probe_ops) + break; + } + if (&probe->list == &tr->func_probes) { + probe = kzalloc(sizeof(*probe), GFP_KERNEL); + if (!probe) { + mutex_unlock(&ftrace_lock); + return -ENOMEM; + } + probe->probe_ops = probe_ops; + probe->ops.func = function_trace_probe_call; + probe->tr = tr; + ftrace_ops_init(&probe->ops); + list_add(&probe->list, &tr->func_probes); } - mutex_lock(&ops->ops.func_hash->regex_lock); + acquire_probe_locked(probe); - orig_hash = &ops->ops.func_hash->filter_hash; + mutex_unlock(&ftrace_lock); + + mutex_lock(&probe->ops.func_hash->regex_lock); + + orig_hash = &probe->ops.func_hash->filter_hash; old_hash = *orig_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); @@ -3998,8 +4058,9 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, * for each function we find. We call the callback * to give the caller an opportunity to do so. */ - if (ops->init) { - ret = ops->init(ops, tr, entry->ip, data); + if (probe_ops->init) { + ret = probe_ops->init(probe_ops, tr, + entry->ip, data); if (ret < 0) goto out; } @@ -4009,16 +4070,22 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, mutex_lock(&ftrace_lock); - ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, - hash, 1); + if (!count) { + /* Nothing was added? 
*/ + ret = -EINVAL; + goto out_unlock; + } + + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, + hash, 1); if (ret < 0) goto err_unlock; - if (list_empty(&ops->list)) - list_add(&ops->list, &tr->func_probes); + /* One ref for each new function traced */ + probe->ref += count; - if (!(ops->ops.flags & FTRACE_OPS_FL_ENABLED)) - ret = ftrace_startup(&ops->ops, 0); + if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED)) + ret = ftrace_startup(&probe->ops, 0); out_unlock: mutex_unlock(&ftrace_lock); @@ -4026,13 +4093,15 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, if (!ret) ret = count; out: - mutex_unlock(&ops->ops.func_hash->regex_lock); + mutex_unlock(&probe->ops.func_hash->regex_lock); free_ftrace_hash(hash); + release_probe(probe); + return ret; err_unlock: - if (!ops->free) + if (!probe_ops->free || !count) goto out_unlock; /* Failed to do the move, need to call the free functions */ @@ -4040,33 +4109,30 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, hlist_for_each_entry(entry, &hash->buckets[i], hlist) { if (ftrace_lookup_ip(old_hash, entry->ip)) continue; - ops->free(ops, tr, entry->ip, NULL); + probe_ops->free(probe_ops, tr, entry->ip, NULL); } } goto out_unlock; } int -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *probe_ops) { struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *entry; + struct ftrace_func_probe *probe; struct ftrace_glob func_g; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; struct ftrace_hash *hash = NULL; struct hlist_node *tmp; struct hlist_head hhd; - struct trace_array *tr; char str[KSYM_SYMBOL_LEN]; - int i, ret; + int count = 0; + int i, ret = -ENODEV; int size; - if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) - return -EINVAL; - - tr = ops->ops.private; - if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) func_g.search = NULL; else if (glob) { @@ -4082,12 +4148,28 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) return -EINVAL; } - mutex_lock(&ops->ops.func_hash->regex_lock); - - orig_hash = &ops->ops.func_hash->filter_hash; - old_hash = *orig_hash; + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(probe, &tr->func_probes, list) { + if (probe->probe_ops == probe_ops) + break; + } + if (&probe->list == &tr->func_probes) + goto err_unlock_ftrace; ret = -EINVAL; + if (!(probe->ops.flags & FTRACE_OPS_FL_INITIALIZED)) + goto err_unlock_ftrace; + + acquire_probe_locked(probe); + + mutex_unlock(&ftrace_lock); + + mutex_lock(&probe->ops.func_hash->regex_lock); + + orig_hash = &probe->ops.func_hash->filter_hash; + old_hash = *orig_hash; + if (ftrace_hash_empty(old_hash)) goto out_unlock; @@ -4112,46 +4194,54 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) if (!ftrace_match(str, &func_g)) continue; } - + count++; remove_hash_entry(hash, entry); hlist_add_head(&entry->hlist, &hhd); } } /* Nothing found? 
*/ - if (hlist_empty(&hhd)) { + if (!count) { ret = -EINVAL; goto out_unlock; } mutex_lock(&ftrace_lock); - if (ftrace_hash_empty(hash)) { - ftrace_shutdown(&ops->ops, 0); - list_del_init(&ops->list); - } + WARN_ON(probe->ref < count); + probe->ref -= count; - ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, + if (ftrace_hash_empty(hash)) + ftrace_shutdown(&probe->ops, 0); + + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, hash, 1); /* still need to update the function call sites */ if (ftrace_enabled && !ftrace_hash_empty(hash)) - ftrace_run_modify_code(&ops->ops, FTRACE_UPDATE_CALLS, + ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, &old_hash_ops); synchronize_sched(); hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { hlist_del(&entry->hlist); - if (ops->free) - ops->free(ops, tr, entry->ip, NULL); + if (probe_ops->free) + probe_ops->free(probe_ops, tr, entry->ip, NULL); kfree(entry); } mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&ops->ops.func_hash->regex_lock); + mutex_unlock(&probe->ops.func_hash->regex_lock); free_ftrace_hash(hash); + + release_probe(probe); + + return ret; + + err_unlock_ftrace: + mutex_unlock(&ftrace_lock); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 368310e78d45..e61610e5e6e3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6832,7 +6832,7 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; if (glob[0] == '!') - return unregister_ftrace_function_probe_func(glob+1, ops); + return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!param) goto out_reg; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 390761804886..e978ecd257b8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -939,8 +939,6 @@ static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) struct ftrace_probe_ops { - struct ftrace_ops ops; - struct list_head list; void (*func)(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, @@ -976,7 +974,8 @@ extern int register_ftrace_function_probe(char *glob, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data); extern int -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 713bec614312..48c7f70cbac7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2653,7 +2653,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, ops = param ? 
&event_disable_count_probe_ops : &event_disable_probe_ops; if (glob[0] == '!') { - ret = unregister_ftrace_function_probe_func(glob+1, ops); + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); goto out; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 797f087183c5..b95f56ba9744 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -597,7 +597,7 @@ ftrace_trace_probe_callback(struct trace_array *tr, return -EINVAL; if (glob[0] == '!') - return unregister_ftrace_function_probe_func(glob+1, ops); + return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!param) goto out_reg; From 6e4443199e5354255e8a4c1e8e5cfc8ef064c3ce Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 19 Apr 2017 22:39:44 -0400 Subject: [PATCH 50/60] tracing/ftrace: Add a better way to pass data via the probe functions With the redesign of the registration and execution of the function probes (triggers), data can now be passed from the setup of the probe to the probe callers that are specific to the trace_array it is on. Although all probes still only affect the toplevel trace array, this change will allow for instances to have their own probes separated from other instances and the top array. That is, something like the stacktrace probe can be set to trace only in an instance and not the toplevel trace array. This isn't implemented yet, but this change lays the groundwork for it. When a probe callback is triggered (someone writes the probe format into set_ftrace_filter), it calls register_ftrace_function_probe(), passing in init_data that will be used to initialize the probe. Then for every matching function, register_ftrace_function_probe() will call the probe_ops->init() function with the init data that was passed to it, as well as the address of a placeholder that is associated with the probe and the instance. The first occurrence will have NULL in that pointer, and the init() function will then initialize it. If other probes are added, or more functions become part of the probe, the placeholder will be passed to the init() function with the data it was initialized to the last time. This placeholder is then passed to each of the other probe_ops functions, where it can be used in the function callback. When the probe_ops free() function is called, it is called either with the ip of the function that is being removed from the probe, or zero, indicating that there are no more functions attached to the probe and the placeholder is about to be freed. This gives the probe_ops a way to free the data it assigned to the placeholder if it was allocated during the first init call.
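Pulled out of the diff below and reformatted for readability, the snapshot probe's init()/free() pair shows the convention end to end:

static int
ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
		     unsigned long ip, void *init_data, void **data)
{
	struct ftrace_func_mapper *mapper = *data;

	/* First function attached on this instance: allocate the placeholder */
	if (!mapper) {
		mapper = allocate_ftrace_func_mapper();
		if (!mapper)
			return -ENOMEM;
		*data = mapper;
	}

	return ftrace_func_mapper_add_ip(mapper, ip, init_data);
}

static void
ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
		     unsigned long ip, void *data)
{
	struct ftrace_func_mapper *mapper = data;

	/* ip == 0: the probe itself is going away, free the placeholder */
	if (!ip) {
		if (!mapper)
			return;
		free_ftrace_func_mapper(mapper, NULL);
		return;
	}

	ftrace_func_mapper_remove_ip(mapper, ip);
}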
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 25 +++++-- kernel/trace/trace.c | 38 ++++++----- kernel/trace/trace.h | 8 +-- kernel/trace/trace_events.c | 116 +++++++++++++++++++-------------- kernel/trace/trace_functions.c | 79 ++++++++++++---------- 5 files changed, 156 insertions(+), 110 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8fdc18500c61..774e9108e5dc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1106,6 +1106,7 @@ struct ftrace_func_probe { struct ftrace_ops ops; struct trace_array *tr; struct list_head list; + void *data; int ref; }; @@ -3187,7 +3188,7 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) probe_ops = probe->probe_ops; if (probe_ops->print) - return probe_ops->print(m, probe_entry->ip, probe_ops, NULL); + return probe_ops->print(m, probe_entry->ip, probe_ops, probe->data); seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, (void *)probe_ops->func); @@ -3814,7 +3815,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - probe_ops->func(ip, parent_ip, probe->tr, probe_ops, NULL); + probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data); preempt_enable_notrace(); } @@ -3972,6 +3973,12 @@ static void release_probe(struct ftrace_func_probe *probe) if (!probe->ref) { probe_ops = probe->probe_ops; + /* + * Sending zero as ip tells probe_ops to free + * the probe->data itself + */ + if (probe_ops->free) + probe_ops->free(probe_ops, probe->tr, 0, probe->data); list_del(&probe->list); kfree(probe); } @@ -4060,9 +4067,15 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, */ if (probe_ops->init) { ret = probe_ops->init(probe_ops, tr, - entry->ip, data); - if (ret < 0) + entry->ip, data, + &probe->data); + if (ret < 0) { + if (probe_ops->free && count) + probe_ops->free(probe_ops, tr, + 0, probe->data); + probe->data = NULL; goto out; + } } count++; } @@ -4109,7 +4122,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, hlist_for_each_entry(entry, &hash->buckets[i], hlist) { if (ftrace_lookup_ip(old_hash, entry->ip)) continue; - probe_ops->free(probe_ops, tr, entry->ip, NULL); + probe_ops->free(probe_ops, tr, entry->ip, probe->data); } } goto out_unlock; @@ -4227,7 +4240,7 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { hlist_del(&entry->hlist); if (probe_ops->free) - probe_ops->free(probe_ops, tr, entry->ip, NULL); + probe_ops->free(probe_ops, tr, entry->ip, probe->data); kfree(entry); } mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e61610e5e6e3..18256cd7ad2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6737,7 +6737,7 @@ static const struct file_operations tracing_dyn_info_fops = { static void ftrace_snapshot(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { tracing_snapshot(); } @@ -6745,9 +6745,9 @@ ftrace_snapshot(unsigned long ip, unsigned long parent_ip, static void ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = data; long *count = NULL; if (mapper) @@ -6768,7 +6768,7 @@ static int 
ftrace_snapshot_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = data; long *count = NULL; seq_printf(m, "%ps:", (void *)ip); @@ -6788,18 +6788,32 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip, static int ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *data) + unsigned long ip, void *init_data, void **data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = *data; - return ftrace_func_mapper_add_ip(mapper, ip, data); + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); } static void ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void **data) + unsigned long ip, void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, NULL); + return; + } ftrace_func_mapper_remove_ip(mapper, ip); } @@ -6842,12 +6856,6 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, if (!strlen(number)) goto out_reg; - if (!ops->private_data) { - ops->private_data = allocate_ftrace_func_mapper(); - if (!ops->private_data) - return -ENOMEM; - } - /* * We use the callback data field (which is a pointer) * as our counter. diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e978ecd257b8..8f6754fba778 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -943,18 +943,18 @@ struct ftrace_probe_ops { unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data); + void *data); int (*init)(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *data); + unsigned long ip, void *init_data, + void **data); void (*free)(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void **data); + unsigned long ip, void *data); int (*print)(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data); - void *private_data; }; struct ftrace_func_mapper; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 48c7f70cbac7..e7973e10398c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2471,54 +2471,54 @@ static void update_event_probe(struct event_probe_data *data) static void event_enable_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **_data) + void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; - struct event_probe_data *data; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; - data = *pdata; - update_event_probe(data); + edata = *pdata; + update_event_probe(edata); } static void event_enable_count_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **_data) + void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; - struct event_probe_data *data; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; - data = *pdata; + 
edata = *pdata; - if (!data->count) + if (!edata->count) return; /* Skip if the event is in a state we want to switch to */ - if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) + if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; - if (data->count != -1) - (data->count)--; + if (edata->count != -1) + (edata->count)--; - update_event_probe(data); + update_event_probe(edata); } static int event_enable_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *_data) + struct ftrace_probe_ops *ops, void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; - struct event_probe_data *data; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); @@ -2526,62 +2526,84 @@ event_enable_print(struct seq_file *m, unsigned long ip, if (WARN_ON_ONCE(!pdata || !*pdata)) return 0; - data = *pdata; + edata = *pdata; seq_printf(m, "%ps:", (void *)ip); seq_printf(m, "%s:%s:%s", - data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, - data->file->event_call->class->system, - trace_event_name(data->file->event_call)); + edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + edata->file->event_call->class->system, + trace_event_name(edata->file->event_call)); - if (data->count == -1) + if (edata->count == -1) seq_puts(m, ":unlimited\n"); else - seq_printf(m, ":count=%ld\n", data->count); + seq_printf(m, ":count=%ld\n", edata->count); return 0; } static int event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *_data) + unsigned long ip, void *init_data, void **data) { - struct ftrace_func_mapper *mapper = ops->private_data; - struct event_probe_data *data = _data; + struct ftrace_func_mapper *mapper = *data; + struct event_probe_data *edata = init_data; int ret; - ret = ftrace_func_mapper_add_ip(mapper, ip, data); + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENODEV; + *data = mapper; + } + + ret = ftrace_func_mapper_add_ip(mapper, ip, edata); if (ret < 0) return ret; - data->ref++; + edata->ref++; return 0; } +static int free_probe_data(void *data) +{ + struct event_probe_data *edata = data; + + edata->ref--; + if (!edata->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(edata->file, 0, 1); + module_put(edata->file->event_call->mod); + kfree(edata); + } + return 0; +} + static void event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void **_data) + unsigned long ip, void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; - struct event_probe_data *data; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; - data = ftrace_func_mapper_remove_ip(mapper, ip); - - if (WARN_ON_ONCE(!data)) + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, free_probe_data); return; - - if (WARN_ON_ONCE(data->ref <= 0)) - return; - - data->ref--; - if (!data->ref) { - /* Remove the SOFT_MODE flag */ - __ftrace_event_enable_disable(data->file, 0, 1); - module_put(data->file->event_call->mod); - kfree(data); } + + edata = ftrace_func_mapper_remove_ip(mapper, ip); + + if (WARN_ON_ONCE(!edata)) + return; + + if (WARN_ON_ONCE(edata->ref <= 0)) + return; + + free_probe_data(edata); } static struct ftrace_probe_ops event_enable_probe_ops = { @@ -2659,12 +2681,6 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, ret = -ENOMEM; - if 
(!ops->private_data) { - ops->private_data = allocate_ftrace_func_mapper(); - if (!ops->private_data) - goto out; - } - data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b95f56ba9744..7775e1ca5bad 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -268,9 +268,10 @@ static struct tracer function_trace __tracer_data = #ifdef CONFIG_DYNAMIC_FTRACE static void update_traceon_count(struct ftrace_probe_ops *ops, - unsigned long ip, bool on) + unsigned long ip, bool on, + void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = data; long *count; long old_count; @@ -329,23 +330,23 @@ static void update_traceon_count(struct ftrace_probe_ops *ops, static void ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { - update_traceon_count(ops, ip, 1); + update_traceon_count(ops, ip, 1, data); } static void ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { - update_traceon_count(ops, ip, 0); + update_traceon_count(ops, ip, 0, data); } static void ftrace_traceon(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { if (tracing_is_on()) return; @@ -356,7 +357,7 @@ ftrace_traceon(unsigned long ip, unsigned long parent_ip, static void ftrace_traceoff(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { if (!tracing_is_on()) return; @@ -376,7 +377,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static void ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { trace_dump_stack(STACK_SKIP); } @@ -384,9 +385,9 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, static void ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = data; long *count; long old_count; long new_count; @@ -423,9 +424,10 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, } while (new_count != old_count); } -static int update_count(struct ftrace_probe_ops *ops, unsigned long ip) +static int update_count(struct ftrace_probe_ops *ops, unsigned long ip, + void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = data; long *count = NULL; if (mapper) @@ -443,9 +445,9 @@ static int update_count(struct ftrace_probe_ops *ops, unsigned long ip) static void ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { - if (update_count(ops, ip)) + if (update_count(ops, ip, data)) ftrace_dump(DUMP_ALL); } @@ -453,17 +455,18 @@ ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, static void ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, - void **data) + void *data) { - if (update_count(ops, ip)) + if (update_count(ops, ip, data)) ftrace_dump(DUMP_ORIG); } static int ftrace_probe_print(const char 
*name, struct seq_file *m, - unsigned long ip, struct ftrace_probe_ops *ops) + unsigned long ip, struct ftrace_probe_ops *ops, + void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = data; long *count = NULL; seq_printf(m, "%ps:%s", (void *)ip, name); @@ -484,52 +487,64 @@ ftrace_traceon_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("traceon", m, ip, ops); + return ftrace_probe_print("traceon", m, ip, ops, data); } static int ftrace_traceoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("traceoff", m, ip, ops); + return ftrace_probe_print("traceoff", m, ip, ops, data); } static int ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("stacktrace", m, ip, ops); + return ftrace_probe_print("stacktrace", m, ip, ops, data); } static int ftrace_dump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("dump", m, ip, ops); + return ftrace_probe_print("dump", m, ip, ops, data); } static int ftrace_cpudump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("cpudump", m, ip, ops); + return ftrace_probe_print("cpudump", m, ip, ops, data); } static int ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *data) + unsigned long ip, void *init_data, void **data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = *data; - return ftrace_func_mapper_add_ip(mapper, ip, data); + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); } static void ftrace_count_free(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void **_data) + unsigned long ip, void *data) { - struct ftrace_func_mapper *mapper = ops->private_data; + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + free_ftrace_func_mapper(mapper, NULL); + return; + } ftrace_func_mapper_remove_ip(mapper, ip); } @@ -607,12 +622,6 @@ ftrace_trace_probe_callback(struct trace_array *tr, if (!strlen(number)) goto out_reg; - if (!ops->private_data) { - ops->private_data = allocate_ftrace_func_mapper(); - if (!ops->private_data) - return -ENOMEM; - } - /* * We use the callback data field (which is a pointer) * as our counter. From d2afd57a4b96f3824220bbb8c067558ca215543f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 11:31:35 -0400 Subject: [PATCH 51/60] tracing/ftrace: Allow instances to have their own function probes Pass around the local trace_array, which is the descriptor for a tracing instance, when enabling and disabling probes. By default, this makes the enable/disable event probe triggers work with instances. The other probes will need some more work to get them working with instances.
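For illustration only (hypothetical code, not part of this patch): with the trace_array passed through, a probe callback receives the descriptor of the instance it was registered in and no longer has to assume the top level global_trace. The callback name below is made up; its signature is the one used by the handlers in these diffs.

static void
example_probe(unsigned long ip, unsigned long parent_ip,
	      struct trace_array *tr, struct ftrace_probe_ops *ops,
	      void *data)
{
	/*
	 * 'tr' is the tracing instance this probe was registered in,
	 * which may be the top level instance or one created under
	 * instances/. Per-instance work acts on 'tr' rather than on
	 * global tracing state.
	 */
	if (!tr)
		return;
	/* ... per-instance work goes here ... */
}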
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 774e9108e5dc..6615197e6597 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3077,7 +3077,7 @@ static void * t_probe_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct trace_array *tr = global_ops.private; + struct trace_array *tr = iter->ops->private; struct list_head *func_probes; struct ftrace_hash *hash; struct list_head *next; @@ -4311,7 +4311,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, char *buff, int len, int enable) { struct ftrace_hash *hash = iter->hash; - struct trace_array *tr = global_ops.private; + struct trace_array *tr = iter->ops->private; char *func, *command, *next = buff; struct ftrace_func_command *p; int ret = -EINVAL; From cab5037950821caa1301df0223de657c6ee202a8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 11:34:06 -0400 Subject: [PATCH 52/60] tracing/ftrace: Enable snapshot function trigger to work with instances Modify the snapshot probe trigger to work with instances. This way the snapshot function trigger will only affect the instance that it is added to in the set_ftrace_filter file. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 18256cd7ad2c..57e9c546bebb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -894,23 +894,8 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -/** - * trace_snapshot - take a snapshot of the current buffer. - * - * This causes a swap between the snapshot buffer and the current live - * tracing buffer. You can use this to take snapshots of the live - * trace when some condition is triggered, but continue to trace. - * - * Note, make sure to allocate the snapshot with either - * a tracing_snapshot_alloc(), or by doing it manually - * with: echo 1 > /sys/kernel/debug/tracing/snapshot - * - * If the snapshot buffer is not allocated, it will stop tracing. - * Basically making a permanent snapshot. - */ -void tracing_snapshot(void) +static void tracing_snapshot_instance(struct trace_array *tr) { - struct trace_array *tr = &global_trace; struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -938,6 +923,27 @@ void tracing_snapshot(void) update_max_tr(tr, current, smp_processor_id()); local_irq_restore(flags); } + +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. 
+ */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + + tracing_snapshot_instance(tr); +} EXPORT_SYMBOL_GPL(tracing_snapshot); static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, @@ -6739,7 +6745,7 @@ ftrace_snapshot(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { - tracing_snapshot(); + tracing_snapshot_instance(tr); } static void @@ -6761,7 +6767,7 @@ ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, (*count)--; } - tracing_snapshot(); + tracing_snapshot_instance(tr); } static int @@ -6868,7 +6874,7 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, ret = register_ftrace_function_probe(glob, tr, ops, count); if (ret >= 0) - alloc_snapshot(&global_trace); + alloc_snapshot(tr); return ret < 0 ? ret : 0; } From 2290f2c589285d0031e3b7445afff8949f3fdbb6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 11:46:03 -0400 Subject: [PATCH 53/60] tracing/ftrace: Allow for the traceonoff probe be unique to instances Have the traceon/off function probe triggers affect only the instance they are set in. This required making the trace_on/off accessible for other files in the tracing directory. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 ++ kernel/trace/trace_functions.c | 21 +++++++++++---------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 57e9c546bebb..60c904fa5480 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -757,7 +757,7 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer, return event; } -static void tracer_tracing_on(struct trace_array *tr) +void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_on(tr->trace_buffer.buffer); @@ -1045,7 +1045,7 @@ void tracing_snapshot_alloc(void) EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ -static void tracer_tracing_off(struct trace_array *tr) +void tracer_tracing_off(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_off(tr->trace_buffer.buffer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8f6754fba778..bc011c1f3d71 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -582,6 +582,8 @@ void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); int tracer_tracing_is_on(struct trace_array *tr); +void tracer_tracing_on(struct trace_array *tr); +void tracer_tracing_off(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 7775e1ca5bad..8c30ca733a5c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -268,7 +268,8 @@ static struct tracer function_trace __tracer_data = #ifdef CONFIG_DYNAMIC_FTRACE static void update_traceon_count(struct ftrace_probe_ops *ops, - unsigned long ip, bool on, + unsigned long ip, + struct trace_array *tr, bool on, void *data) { struct ftrace_func_mapper *mapper = data; @@ -313,13 +314,13 @@ static void update_traceon_count(struct ftrace_probe_ops *ops, /* Make sure we see count before checking tracing state */ smp_rmb(); - if (on == !!tracing_is_on()) + if (on == !!tracer_tracing_is_on(tr)) return; if (on) - tracing_on(); + 
tracer_tracing_on(tr); else - tracing_off(); + tracer_tracing_off(tr); /* Make sure tracing state is visible before updating count */ smp_wmb(); @@ -332,7 +333,7 @@ ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { - update_traceon_count(ops, ip, 1, data); + update_traceon_count(ops, ip, tr, 1, data); } static void @@ -340,7 +341,7 @@ ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { - update_traceon_count(ops, ip, 0, data); + update_traceon_count(ops, ip, tr, 0, data); } static void @@ -348,10 +349,10 @@ ftrace_traceon(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { - if (tracing_is_on()) + if (tracer_tracing_is_on(tr)) return; - tracing_on(); + tracer_tracing_on(tr); } static void @@ -359,10 +360,10 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { - if (!tracing_is_on()) + if (!tracer_tracing_is_on(tr)) return; - tracing_off(); + tracer_tracing_off(tr); } /* From dcc19d28091a86d5baf78e3fbb32e3fc3de524be Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 11:59:18 -0400 Subject: [PATCH 54/60] tracing/ftrace: Allow for instances to trigger their own stacktrace probes Have the stacktrace function probe trigger stack traces within the instance that it was added to in the set_ftrace_filter. ># cd /sys/kernel/debug/tracing ># mkdir instances/foo ># cd instances/foo ># echo schedule:stacktrace:1 > set_ftrace_filter ># cat trace # tracer: nop # # entries-in-buffer/entries-written: 1/1 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | <idle>-0 [001] .N.2 202.585010: <stack trace> => schedule => schedule_preempt_disabled => do_idle => cpu_startup_entry => start_secondary => verify_cpu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8c30ca733a5c..a3bddbfd0874 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -375,12 +375,23 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, */ #define STACK_SKIP 4 +static __always_inline void trace_stack(struct trace_array *tr) +{ + unsigned long flags; + int pc; + + local_save_flags(flags); + pc = preempt_count(); + + __trace_stack(tr, flags, STACK_SKIP, pc); +} + static void ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { - trace_dump_stack(STACK_SKIP); + trace_stack(tr); } static void @@ -398,7 +409,7 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, /* unlimited?
*/ if (!mapper) { - trace_dump_stack(STACK_SKIP); + trace_stack(tr); return; } @@ -417,7 +428,7 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, new_count = old_count - 1; new_count = cmpxchg(count, old_count, new_count); if (new_count == old_count) - trace_dump_stack(STACK_SKIP); + trace_stack(tr); if (!tracing_is_on()) return; From b5b77be812de78265e794523bf4e969fadf581be Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 12:53:18 -0400 Subject: [PATCH 55/60] selftests: ftrace: Allow some tests to be run in a tracing instance A tracing instance has several of the same capabilities as the top level instance, but may be implemented slightly differently. Instead of just writing tests that duplicate the same test cases of the top level instance, allow a test to be written for both the top level as well as for an instance. If a test case can be run in both the top level as well as in a tracing instance directory, then it should add a tag "# flags: instance" in the header of the test file. Then after all tests have run, any test that has the instance flag set will run again within a tracing instance. Link: http://lkml.kernel.org/r/20170421233850.1d0e9e05@gandalf.local.home Cc: Shuah Khan Acked-by: Namhyung Kim Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/ftracetest | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index a8631d978725..32e6211e1c6e 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -150,11 +150,16 @@ XFAILED_CASES= UNDEFINED_CASES= TOTAL_RESULT=0 +INSTANCE= CASENO=0 testcase() { # testfile CASENO=$((CASENO+1)) desc=`grep "^#[ \t]*description:" $1 | cut -f2 -d:` - prlog -n "[$CASENO]$desc" + prlog -n "[$CASENO]$INSTANCE$desc" +} + +test_on_instance() { # testfile + grep -q "^#[ \t]*flags:.*instance" $1 } eval_result() { # sigval @@ -271,6 +276,17 @@ for t in $TEST_CASES; do run_test $t done +# Test on instance loop +INSTANCE=" (instance) " +for t in $TEST_CASES; do + test_on_instance $t || continue + SAVED_TRACING_DIR=$TRACING_DIR + export TRACING_DIR=`mktemp -d $TRACING_DIR/instances/ftracetest.XXXXXX` + run_test $t + rmdir $TRACING_DIR + TRACING_DIR=$SAVED_TRACING_DIR +done + prlog "" prlog "# of passed: " `echo $PASSED_CASES | wc -w` prlog "# of failed: " `echo $FAILED_CASES | wc -w` From 6df0fee3e8ab6a612a971c5f837e61149448eae8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 13:32:56 -0400 Subject: [PATCH 56/60] selftests: ftrace: Make func_event_triggers and func_traceonoff_triggers tests do instances Both the func_event_triggers and func_traceonoff_triggers tests can be performed in the toplevel instance as well as in individual instances. Have their tests run in both cases.
Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- .../selftests/ftrace/test.d/ftrace/func_event_triggers.tc | 1 + .../selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc index 5c60afca24a6..07bb3e5930b4 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: ftrace - test for function event triggers +# flags: instance # # Ftrace allows to add triggers to functions, such as enabling or disabling # tracing, enabling or disabling trace events, or recording a stack trace diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc index 3c60ca61fee3..c8e02ec01eaf 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: ftrace - test for function traceon/off triggers +# flags: instance # # Ftrace allows to add triggers to functions, such as enabling or disabling # tracing, enabling or disabling trace events, or recording a stack trace From 35df6a894e5b282f725743a8cb9dd7ed8d63e320 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 13:39:40 -0400 Subject: [PATCH 57/60] selftests: ftrace: Have event tests also run in a tracing instance The ftrace selftests of events: event-enable, event-pid, and subsystem-enable can all be run inside an instance. Change their tests to do both a toplevel run and an instance run.
Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/test.d/event/event-enable.tc | 1 + tools/testing/selftests/ftrace/test.d/event/event-pid.tc | 1 + tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc | 1 + 3 files changed, 3 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/event/event-enable.tc b/tools/testing/selftests/ftrace/test.d/event/event-enable.tc index 87eb9d6dd4ca..283b45ecb199 100644 --- a/tools/testing/selftests/ftrace/test.d/event/event-enable.tc +++ b/tools/testing/selftests/ftrace/test.d/event/event-enable.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: event tracing - enable/disable with event level files +# flags: instance do_reset() { echo > set_event diff --git a/tools/testing/selftests/ftrace/test.d/event/event-pid.tc b/tools/testing/selftests/ftrace/test.d/event/event-pid.tc index d4ab27b522f8..96c1a95be4f7 100644 --- a/tools/testing/selftests/ftrace/test.d/event/event-pid.tc +++ b/tools/testing/selftests/ftrace/test.d/event/event-pid.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: event tracing - restricts events based on pid +# flags: instance do_reset() { echo > set_event diff --git a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc index ced27ef0638f..b8fe2e5b9e67 100644 --- a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc +++ b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: event tracing - enable/disable with subsystem level files +# flags: instance do_reset() { echo > set_event From 03c201759e9726ca6abaf8cabac936fd62c2543f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 13:44:32 -0400 Subject: [PATCH 58/60] selftests: ftrace: Have some basic tests run in a tracing instance too Some of the basic ftrace selftests should also be run in an instance. One tests a quick case of running all the tracers in the available_tracers file within the instance. The other tests the clock used for the instance.
Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/test.d/00basic/basic2.tc | 1 + tools/testing/selftests/ftrace/test.d/00basic/basic3.tc | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/00basic/basic2.tc b/tools/testing/selftests/ftrace/test.d/00basic/basic2.tc index bf9a7b037924..ebfce83f35b4 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/basic2.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/basic2.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: Basic test for tracers +# flags: instance test -f available_tracers for t in `cat available_tracers`; do echo $t > current_tracer diff --git a/tools/testing/selftests/ftrace/test.d/00basic/basic3.tc b/tools/testing/selftests/ftrace/test.d/00basic/basic3.tc index bde6625d9785..9e33f841812f 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/basic3.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/basic3.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: Basic trace clock test +# flags: instance test -f trace_clock for c in `cat trace_clock | tr -d \[\]`; do echo $c > trace_clock From ca2958f14c4706d5dced95f4f7dfe2bdd1b268de Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 14:20:19 -0400 Subject: [PATCH 59/60] selftests: ftrace: Allow some event trigger tests to run in an instance Some of the event triggers can run fine in an instance. Have them tested in one as well. The ones that still need work are the snapshot, stacktrace and traceon/off triggers, as they don't currently pass a handle to the trace_array they are attached to. But that can be for a future project. Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- .../selftests/ftrace/test.d/trigger/trigger-eventonoff.tc | 1 + tools/testing/selftests/ftrace/test.d/trigger/trigger-filter.tc | 1 + .../testing/selftests/ftrace/test.d/trigger/trigger-hist-mod.tc | 1 + tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc | 1 + .../testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc | 1 + 5 files changed, 5 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-eventonoff.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-eventonoff.tc index 1a9445021bf1..c5435adfdd93 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-eventonoff.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-eventonoff.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: event trigger - test event enable/disable trigger +# flags: instance do_reset() { reset_trigger diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-filter.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-filter.tc index 514e466e198b..48849a8d577f 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-filter.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-filter.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: event trigger - test trigger filter +# flags: instance do_reset() { reset_trigger diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-mod.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-mod.tc index 400e98b64948..b7f86d10b549 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-mod.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-mod.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: event trigger - test histogram modifiers +# flags: instance do_reset() { reset_trigger diff 
--git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc index a00184cd9c95..fb66f7d9339d 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: event trigger - test histogram trigger +# flags: instance do_reset() { reset_trigger diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc index 3478b00ead57..f9153087dd7c 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc @@ -1,5 +1,6 @@ #!/bin/sh # description: event trigger - test multiple histogram triggers +# flags: instance do_reset() { reset_trigger From 73a757e63114dfd765f1c5d1ff7e994f123d0234 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 May 2017 09:35:09 -0400 Subject: [PATCH 60/60] ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page(), which allocates and initializes a reader page that the ring buffer swaps in for a newly filled page to hand to the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can instead hold on to the page and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer.
Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 40 +++++++++++++++++++++++++--- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 17 ++++++++---- 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b6d4568795a7..ee9b461af095 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -185,7 +185,7 @@ size_t ring_buffer_page_len(void *page); void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu); -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 96fc3c043ad6..01b4ee5326cf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,6 +438,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; + struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; @@ -4377,9 +4378,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct buffer_data_page *bpage; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = NULL; + unsigned long flags; struct page *page; + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (cpu_buffer->free_page) { + bpage = cpu_buffer->free_page; + cpu_buffer->free_page = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + if (bpage) + goto out; + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) @@ -4387,6 +4404,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) bpage = page_address(page); + out: rb_init_page(bpage); return bpage; @@ -4396,13 +4414,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for + * @cpu: the cpu buffer the page came from * @data: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. 
*/ -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { - free_page((unsigned long)data); + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = data; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (!cpu_buffer->free_page) { + cpu_buffer->free_page = bpage; + bpage = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index c190a4d5013c..9fbcaf567886 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -171,7 +171,7 @@ static enum event_status read_page(int cpu) } } } - ring_buffer_free_read_page(buffer, bpage); + ring_buffer_free_read_page(buffer, cpu, bpage); if (ret < 0) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60c904fa5480..5b645b0fbbb8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6054,6 +6054,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file) struct ftrace_buffer_info { struct trace_iterator iter; void *spare; + unsigned int spare_cpu; unsigned int read; }; @@ -6383,9 +6384,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return -EBUSY; #endif - if (!info->spare) + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); + info->spare_cpu = iter->cpu_file; + } if (!info->spare) return -ENOMEM; @@ -6445,7 +6448,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); if (info->spare) - ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, + info->spare_cpu, info->spare); kfree(info); mutex_unlock(&trace_types_lock); @@ -6456,6 +6460,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct buffer_ref { struct ring_buffer *buffer; void *page; + int cpu; int ref; }; @@ -6467,7 +6472,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); buf->private = 0; } @@ -6501,7 +6506,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); spd->partial[i].private = 0; } @@ -6566,11 +6571,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, kfree(ref); break; } + ref->cpu = iter->cpu_file; r = ring_buffer_read_page(ref->buffer, &ref->page, len, iter->cpu_file, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, + ref->page); kfree(ref); break; }
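As a usage sketch (a hypothetical caller, not taken from the patch): the new cpu argument means a reader must hand a page back to the same per-CPU buffer it was taken from, which is what lets the ring buffer park the page in its free_page slot for reuse instead of always going through the page allocator. The function name read_one_page below is made up; the ring buffer calls are the ones this patch touches.

static void read_one_page(struct ring_buffer *buffer, int cpu)
{
	void *page;
	int ret;

	/* May return the cached free_page instead of allocating one. */
	page = ring_buffer_alloc_read_page(buffer, cpu);
	if (!page)
		return;

	/* full=1: only consume the page if it has been completely filled. */
	ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 1);
	if (ret >= 0) {
		/* ... process the events now sitting in 'page' ... */
	}

	/*
	 * Give the page back to the cpu buffer it came from. If the
	 * free_page slot is empty it is stashed there for the next
	 * reader; otherwise it falls back to free_page().
	 */
	ring_buffer_free_read_page(buffer, cpu, page);
}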