From 575570f02761bd680ba5731c1dfd4701062e7fb2 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 27 Jul 2010 16:06:34 +0800 Subject: [PATCH 1/5] tracing: Fix an unallocated memory access in function_graph With CONFIG_DEBUG_PAGEALLOC, I observed an unallocated memory access in function_graph trace. It appears we find a small size entry in ring buffer, but we access it as a big size entry. The access overflows the page size and touches an unallocated page. Cc: Signed-off-by: Shaohua Li LKML-Reference: <1280217994.32400.76.camel@sli10-desk.sh.intel.com> [ Added a comment to explain the problem - SDR ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 79f4bac99a94..b4c179ae4e45 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -507,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter, * if the output fails. */ data->ent = *curr; - data->ret = *next; + /* + * If the next event is not a return type, then + * we only care about what type it is. Otherwise we can + * safely copy the entire event. + */ + if (next->ent.type == TRACE_GRAPH_RET) + data->ret = *next; + else + data->ret.ent.type = next->ent.type; } } From 18fab912d4fa70133df164d2dcf3310be0c38c34 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 28 Jul 2010 14:14:01 +0800 Subject: [PATCH 2/5] tracing: Fix ring_buffer_read_page reading out of page boundary With the configuration: CONFIG_DEBUG_PAGEALLOC=y and Shaohua's patch: [PATCH]x86: make spurious_fault check correct pte bit Function call graph trace with the following will trigger a page fault. # cd /sys/kernel/debug/tracing/ # echo function_graph > current_tracer # cat per_cpu/cpu1/trace_pipe_raw > /dev/null BUG: unable to handle kernel paging request at ffff880006e99000 IP: [] rb_event_length+0x1/0x3f PGD 1b19063 PUD 1b1d063 PMD 3f067 PTE 6e99160 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC last sysfs file: /sys/devices/virtual/net/lo/operstate CPU 1 Modules linked in: Pid: 1982, comm: cat Not tainted 2.6.35-rc6-aes+ #300 /Bochs RIP: 0010:[] [] rb_event_length+0x1/0x3f RSP: 0018:ffff880006475e38 EFLAGS: 00010006 RAX: 0000000000000ff0 RBX: ffff88000786c630 RCX: 000000000000001d RDX: ffff880006e98000 RSI: 0000000000000ff0 RDI: ffff880006e99000 RBP: ffff880006475eb8 R08: 000000145d7008bd R09: 0000000000000000 R10: 0000000000008000 R11: ffffffff815d9336 R12: ffff880006d08000 R13: ffff880006e605d8 R14: 0000000000000000 R15: 0000000000000018 FS: 00007f2b83e456f0(0000) GS:ffff880002100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff880006e99000 CR3: 00000000064a8000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process cat (pid: 1982, threadinfo ffff880006474000, task ffff880006e40770) Stack: ffff880006475eb8 ffffffff8108730f 0000000000000ff0 000000145d7008bd <0> ffff880006e98010 ffff880006d08010 0000000000000296 ffff88000786c640 <0> ffffffff81002956 0000000000000000 ffff8800071f4680 ffff8800071f4680 Call Trace: [] ? ring_buffer_read_page+0x15a/0x24a [] ? return_to_handler+0x15/0x2f [] tracing_buffers_read+0xb9/0x164 [] vfs_read+0xaf/0x150 [] return_to_handler+0x0/0x2f [] __bad_area_nosemaphore+0x17e/0x1a1 [] return_to_handler+0x0/0x2f [] bad_area_nosemaphore+0x13/0x15 Code: 80 25 b2 16 b3 00 fe c9 c3 55 48 89 e5 f0 80 0d a4 16 b3 00 02 c9 c3 55 31 c0 48 89 e5 48 83 3d 94 16 b3 00 01 c9 0f 94 c0 c3 55 <8a> 0f 48 89 e5 83 e1 1f b8 08 00 00 00 0f b6 d1 83 fa 1e 74 27 RIP [] rb_event_length+0x1/0x3f RSP CR2: ffff880006e99000 ---[ end trace a6877bb92ccb36bb ]--- The root cause is that ring_buffer_read_page() may read out of page boundary, because the boundary checking is done after reading. This is fixed via doing boundary checking before reading. Reported-by: Shaohua Li Cc: Signed-off-by: Huang Ying LKML-Reference: <1280297641.2771.307.camel@yhuang-dev> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1da7b6ea8b85..5ec8f1d1480e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3868,6 +3868,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, rpos = reader->read; pos += size; + if (rpos >= commit) + break; + event = rb_reader_event(cpu_buffer); size = rb_event_length(event); } while (len > size); From 465c6cca2668a2db2a4ffce3dca5714017873f2b Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 6 Aug 2010 03:26:24 -0400 Subject: [PATCH 3/5] tracing: Extend recordmcount to better support Blackfin mcount The mcount call on Blackfin systems includes some stack manipulation around the actual call site, so extend the build time perl script to support this. This way we can avoid doing the calculation at runtime. Signed-off-by: Mike Frysinger LKML-Reference: <1281079584-21205-1-git-send-email-vapier@gentoo.org> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index f3c9c0a90b98..2c56539e5da2 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -159,6 +159,7 @@ my $section_regex; # Find the start of a section my $function_regex; # Find the name of a function # (return offset and func name) my $mcount_regex; # Find the call site to mcount (return offset) +my $mcount_adjust; # Address adjustment to mcount offset my $alignment; # The .align value to use for $mcount_section my $section_type; # Section header plus possible alignment command my $can_use_local = 0; # If we can use local function references @@ -213,6 +214,7 @@ $section_regex = "Disassembly of section\\s+(\\S+):"; $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; $section_type = '@progbits'; +$mcount_adjust = 0; $type = ".long"; if ($arch eq "x86_64") { @@ -351,6 +353,9 @@ if ($arch eq "x86_64") { } elsif ($arch eq "microblaze") { # Microblaze calls '_mcount' instead of plain 'mcount'. $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; +} elsif ($arch eq "blackfin") { + $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s__mcount\$"; + $mcount_adjust = -4; } else { die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; } @@ -511,7 +516,7 @@ while () { } # is this a call site to mcount? If so, record it to print later if ($text_found && /$mcount_regex/) { - push(@offsets, hex $1); + push(@offsets, (hex $1) + $mcount_adjust); } } From 2a37a3df57c44e947271758a1aa4bea7bff9feab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Jun 2010 15:21:34 -0400 Subject: [PATCH 4/5] tracing/events: Convert format output to seq_file Two new events were added that broke the current format output. Both from the SCSI system: scsi_dispatch_cmd_done and scsi_dispatch_cmd_timeout The reason is that their print_fmt exceeded a page size. Since the output of the format used simple_read_from_buffer and trace_seq, it was limited to a page size in output. This patch converts the printing of the format of an event into seq_file, which allows greater than a page size to be shown. I diffed all event formats comparing the output with and without this patch. All matched except for the above two, which showed just: FORMAT TOO BIG without this patch, but now properly displays the output with this patch. v2: Remove updating *pos in seq start function. [ Thanks to Li Zefan for pointing that out ] Reviewed-by: Li Zefan Cc: Martin K. Petersen Cc: Kei Tokunaga Cc: James Bottomley Cc: Tomohiro Kusumi Cc: Xiao Guangrong Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 206 ++++++++++++++++++++++++------------ 1 file changed, 140 insertions(+), 66 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53cffc0b0801..45a8968707aa 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -29,6 +29,8 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +#define COMMON_FIELD_COUNT 5 + struct list_head * trace_get_fields(struct ftrace_event_call *event_call) { @@ -544,85 +546,155 @@ out: return ret; } -static ssize_t -event_format_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +enum { + FORMAT_HEADER = 1, + FORMAT_PRINTFMT = 2, +}; + +static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_call *call = m->private; struct ftrace_event_field *field; struct list_head *head; - struct trace_seq *s; - int common_field_count = 5; - char *buf; - int r = 0; + loff_t index = *pos; - if (*ppos) - return 0; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - - trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->event.type); - trace_seq_printf(s, "format:\n"); + (*pos)++; head = trace_get_fields(call); - list_for_each_entry_reverse(field, head, link) { - /* - * Smartly shows the array type(except dynamic array). - * Normal: - * field:TYPE VAR - * If TYPE := TYPE[LEN], it is shown: - * field:TYPE VAR[LEN] - */ - const char *array_descriptor = strchr(field->type, '['); - if (!strncmp(field->type, "__data_loc", 10)) - array_descriptor = NULL; + switch ((unsigned long)v) { + case FORMAT_HEADER: - if (!array_descriptor) { - r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" - "\tsize:%u;\tsigned:%d;\n", - field->type, field->name, field->offset, - field->size, !!field->is_signed); - } else { - r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" - "\tsize:%u;\tsigned:%d;\n", - (int)(array_descriptor - field->type), - field->type, field->name, - array_descriptor, field->offset, - field->size, !!field->is_signed); - } + if (unlikely(list_empty(head))) + return NULL; - if (--common_field_count == 0) - r = trace_seq_printf(s, "\n"); + field = list_entry(head->prev, struct ftrace_event_field, link); + return field; - if (!r) - break; + case FORMAT_PRINTFMT: + /* all done */ + return NULL; } - if (r) - r = trace_seq_printf(s, "\nprint fmt: %s\n", - call->print_fmt); + /* + * To separate common fields from event fields, the + * LSB is set on the first event field. Clear it in case. + */ + v = (void *)((unsigned long)v & ~1L); - if (!r) { - /* - * ug! The format output is bigger than a PAGE!! - */ - buf = "FORMAT TOO BIG\n"; - r = simple_read_from_buffer(ubuf, cnt, ppos, - buf, strlen(buf)); - goto out; + field = v; + if (field->link.prev == head) + return (void *)FORMAT_PRINTFMT; + + field = list_entry(field->link.prev, struct ftrace_event_field, link); + + /* Set the LSB to notify f_show to print an extra newline */ + if (index == COMMON_FIELD_COUNT) + field = (struct ftrace_event_field *) + ((unsigned long)field | 1); + + return field; +} + +static void *f_start(struct seq_file *m, loff_t *pos) +{ + loff_t l = 0; + void *p; + + /* Start by showing the header */ + if (!*pos) + return (void *)FORMAT_HEADER; + + p = (void *)FORMAT_HEADER; + do { + p = f_next(m, p, &l); + } while (p && l < *pos); + + return p; +} + +static int f_show(struct seq_file *m, void *v) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_field *field; + const char *array_descriptor; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "ID: %d\n", call->event.type); + seq_printf(m, "format:\n"); + return 0; + + case FORMAT_PRINTFMT: + seq_printf(m, "\nprint fmt: %s\n", + call->print_fmt); + return 0; } - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - out: - kfree(s); - return r; + /* + * To separate common fields from event fields, the + * LSB is set on the first event field. Clear it and + * print a newline if it is set. + */ + if ((unsigned long)v & 1) { + seq_putc(m, '\n'); + v = (void *)((unsigned long)v & ~1L); + } + + field = v; + + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + array_descriptor = strchr(field->type, '['); + + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; + + if (!array_descriptor) + seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); + + return 0; +} + +static void f_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations trace_format_seq_ops = { + .start = f_start, + .next = f_next, + .stop = f_stop, + .show = f_show, +}; + +static int trace_format_open(struct inode *inode, struct file *file) +{ + struct ftrace_event_call *call = inode->i_private; + struct seq_file *m; + int ret; + + ret = seq_open(file, &trace_format_seq_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = call; + + return 0; } static ssize_t @@ -820,8 +892,10 @@ static const struct file_operations ftrace_enable_fops = { }; static const struct file_operations ftrace_event_format_fops = { - .open = tracing_open_generic, - .read = event_format_read, + .open = trace_format_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; static const struct file_operations ftrace_event_id_fops = { From 1aa54bca6ee0d07ebcafb8ca8074b624d80724aa Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Wed, 28 Jul 2010 01:18:01 +0200 Subject: [PATCH 5/5] tracing: Sanitize value returned from write(trace_marker, "...", len) When userspace code writes non-new-line-terminated string to trace_marker file, write handler appends new-line and returns number of bytes written to trace buffer, so write(fd, "abc", 3) will return 4 That's unexpected and unfortunately it confuses glibc's fprintf function. Example: int main() { fprintf(stderr, "abc"); return 0; } $ gcc test.c -o test $ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer $ ./test 2>/sys/kernel/debug/tracing/trace_marker results in infinite loop: write(fd, "abc", 3) = 4 write(fd, "", 1) = 0 write(fd, "", 1) = 0 write(fd, "", 1) = 0 write(fd, "", 1) = 0 write(fd, "", 1) = 0 write(fd, "", 1) = 0 write(fd, "", 1) = 0 (...) ...and kernel trace buffer full of empty markers. Fix it by sanitizing write return value. Signed-off-by: Marcin Slusarz LKML-Reference: <20100727231801.GB2826@joi.lan> Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 086d36316805..88b42d14d32d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3498,6 +3498,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { char *buf; + size_t written; if (tracing_disabled) return -EINVAL; @@ -3519,11 +3520,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else buf[cnt] = '\0'; - cnt = mark_printk("%s", buf); + written = mark_printk("%s", buf); kfree(buf); - *fpos += cnt; + *fpos += written; - return cnt; + /* don't tell userspace we wrote more - it might confuse them */ + if (written > cnt) + written = cnt; + + return written; } static int tracing_clock_show(struct seq_file *m, void *v)