From 7162431dcf72032835d369c8d7b51311df407938 Mon Sep 17 00:00:00 2001
From: Miroslav Benes
Date: Wed, 16 Oct 2019 13:33:13 +0200
Subject: [PATCH 01/51] ftrace: Introduce PERMANENT ftrace_ops flag

Livepatch uses ftrace for redirection to new patched functions. It means
that if ftrace is disabled, all live patched functions are disabled as well.
Toggling the global 'ftrace_enabled' sysctl thus affects it directly. It is
not a problem per se, because only an administrator can set sysctl values,
but it still may be surprising.

Introduce a PERMANENT ftrace_ops flag to amend this. If
FTRACE_OPS_FL_PERMANENT is set on any ftrace ops, the tracing cannot be
disabled by disabling ftrace_enabled. Equally, a callback with the flag set
cannot be registered if ftrace_enabled is disabled.

Link: http://lkml.kernel.org/r/20191016113316.13415-2-mbenes@suse.cz

Reviewed-by: Petr Mladek
Reviewed-by: Kamalesh Babulal
Signed-off-by: Miroslav Benes
Signed-off-by: Steven Rostedt (VMware)
---
 Documentation/trace/ftrace-uses.rst | 8 ++++++++
 Documentation/trace/ftrace.rst | 4 +++-
 include/linux/ftrace.h | 3 +++
 kernel/livepatch/patch.c | 3 ++-
 kernel/trace/ftrace.c | 23 +++++++++++++++++++++--
 5 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst
index 1fbc69894eed..740bd0224d35 100644
--- a/Documentation/trace/ftrace-uses.rst
+++ b/Documentation/trace/ftrace-uses.rst
@@ -170,6 +170,14 @@ FTRACE_OPS_FL_RCU
 a callback may be executed and RCU synchronization will
 not protect it.

+FTRACE_OPS_FL_PERMANENT
+ If this is set on any ftrace ops, then the tracing cannot be disabled by
+ writing 0 to the proc sysctl ftrace_enabled. Equally, a callback with
+ the flag set cannot be registered if ftrace_enabled is 0.
+
+ Livepatch uses it so that the function redirection is not lost and the
+ system stays protected.
+
 Filtering which functions to trace
 ==================================

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index e3060eedb22d..d2b5657ed33e 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -2976,7 +2976,9 @@ Note, the proc sysctl ftrace_enable is a big on/off switch for the
 function tracer. By default it is enabled (when function tracing is
 enabled in the kernel). If it is disabled, all function tracing is
 disabled. This includes not only the function tracers for ftrace, but
-also for any other uses (perf, kprobes, stack tracing, profiling, etc).
+also for any other uses (perf, kprobes, stack tracing, profiling, etc). It
+cannot be disabled while a callback with FTRACE_OPS_FL_PERMANENT set is
+registered.

 Please disable this with care.

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8a8cb3c401b2..8385cafe4f9f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -142,6 +142,8 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
 * PID - Is affected by set_ftrace_pid (allows filtering on those pids)
 * RCU - Set when the ops can only be called when RCU is watching.
 * TRACE_ARRAY - The ops->private points to a trace_array descriptor.
+ * PERMANENT - Set when the ops is permanent and should not be affected by
+ * ftrace_enabled.
*/ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -160,6 +162,7 @@ enum { FTRACE_OPS_FL_PID = 1 << 13, FTRACE_OPS_FL_RCU = 1 << 14, FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, + FTRACE_OPS_FL_PERMANENT = 1 << 16, }; #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index bd43537702bd..b552cf2d85f8 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -196,7 +196,8 @@ static int klp_patch_func(struct klp_func *func) ops->fops.func = klp_ftrace_handler; ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; + FTRACE_OPS_FL_IPMODIFY | + FTRACE_OPS_FL_PERMANENT; list_add(&ops->node, &klp_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f296d89be757..89e9128652ef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -326,6 +326,8 @@ int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) ops->flags |= FTRACE_OPS_FL_SAVE_REGS; #endif + if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT)) + return -EBUSY; if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -6754,6 +6756,18 @@ int unregister_ftrace_function(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +static bool is_permanent_ops_registered(void) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PERMANENT) + return true; + } while_for_each_ftrace_op(op); + + return false; +} + int ftrace_enable_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -6771,8 +6785,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = !!ftrace_enabled; - if (ftrace_enabled) { /* we are starting ftrace again */ @@ -6783,12 +6795,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); } else { + if (is_permanent_ops_registered()) { + ftrace_enabled = true; + ret = -EBUSY; + goto out; + } + /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; ftrace_shutdown_sysctl(); } + last_ftrace_enabled = !!ftrace_enabled; out: mutex_unlock(&ftrace_lock); return ret; From 35c9e74cff4c798d0a07830d0d483e6ebe70352f Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Wed, 16 Oct 2019 13:33:14 +0200 Subject: [PATCH 02/51] selftests/livepatch: Make dynamic debug setup and restore generic Livepatch selftests currently save the current dynamic debug config and tweak it for the selftests. The config is restored at the end. Make the infrastructure generic, so that more variables can be saved and restored. 
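For illustration, the generic save/tweak/restore pattern being introduced
looks like this (a minimal standalone sketch, not the selftest code itself;
the kernel.printk knob is used purely for illustration, and the real
helpers are in the functions.sh diff below):

  # Sketch: save the current value, arrange for it to be restored on any
  # exit path via a trap, then tweak it for the test run.
  function push_config() {
          SAVED_PRINTK=$(cat /proc/sys/kernel/printk)
  }
  function pop_config() {
          [[ -n "$SAVED_PRINTK" ]] && echo "$SAVED_PRINTK" > /proc/sys/kernel/printk
  }
  push_config
  trap pop_config EXIT INT TERM HUP
  echo "8 4 1 7" > /proc/sys/kernel/printk    # tweak for the test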
Link: http://lkml.kernel.org/r/20191016113316.13415-3-mbenes@suse.cz Signed-off-by: Joe Lawrence Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- .../testing/selftests/livepatch/functions.sh | 22 +++++++++++-------- .../selftests/livepatch/test-callbacks.sh | 2 +- .../selftests/livepatch/test-livepatch.sh | 2 +- .../selftests/livepatch/test-shadow-vars.sh | 2 +- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index 79b0affd21fb..b7e5a67ae434 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -29,29 +29,33 @@ function die() { exit 1 } -function push_dynamic_debug() { - DYNAMIC_DEBUG=$(grep '^kernel/livepatch' /sys/kernel/debug/dynamic_debug/control | \ - awk -F'[: ]' '{print "file " $1 " line " $2 " " $4}') +function push_config() { + DYNAMIC_DEBUG=$(grep '^kernel/livepatch' /sys/kernel/debug/dynamic_debug/control | \ + awk -F'[: ]' '{print "file " $1 " line " $2 " " $4}') } -function pop_dynamic_debug() { +function pop_config() { if [[ -n "$DYNAMIC_DEBUG" ]]; then echo -n "$DYNAMIC_DEBUG" > /sys/kernel/debug/dynamic_debug/control fi } -# set_dynamic_debug() - save the current dynamic debug config and tweak -# it for the self-tests. Set a script exit trap -# that restores the original config. function set_dynamic_debug() { - push_dynamic_debug - trap pop_dynamic_debug EXIT INT TERM HUP cat <<-EOF > /sys/kernel/debug/dynamic_debug/control file kernel/livepatch/* +p func klp_try_switch_task -p EOF } +# setup_config - save the current config and set a script exit trap that +# restores the original config. Setup the dynamic debug +# for verbose livepatching output. 
+function setup_config() { + push_config + set_dynamic_debug + trap pop_config EXIT INT TERM HUP +} + # loop_until(cmd) - loop a command until it is successful or $MAX_RETRIES, # sleep $RETRY_INTERVAL between attempts # cmd - command and its arguments to run diff --git a/tools/testing/selftests/livepatch/test-callbacks.sh b/tools/testing/selftests/livepatch/test-callbacks.sh index e97a9dcb73c7..a35289b13c9c 100755 --- a/tools/testing/selftests/livepatch/test-callbacks.sh +++ b/tools/testing/selftests/livepatch/test-callbacks.sh @@ -9,7 +9,7 @@ MOD_LIVEPATCH2=test_klp_callbacks_demo2 MOD_TARGET=test_klp_callbacks_mod MOD_TARGET_BUSY=test_klp_callbacks_busy -set_dynamic_debug +setup_config # TEST: target module before livepatch diff --git a/tools/testing/selftests/livepatch/test-livepatch.sh b/tools/testing/selftests/livepatch/test-livepatch.sh index f05268aea859..493e3df415a1 100755 --- a/tools/testing/selftests/livepatch/test-livepatch.sh +++ b/tools/testing/selftests/livepatch/test-livepatch.sh @@ -7,7 +7,7 @@ MOD_LIVEPATCH=test_klp_livepatch MOD_REPLACE=test_klp_atomic_replace -set_dynamic_debug +setup_config # TEST: basic function patching diff --git a/tools/testing/selftests/livepatch/test-shadow-vars.sh b/tools/testing/selftests/livepatch/test-shadow-vars.sh index 04a37831e204..1aae73299114 100755 --- a/tools/testing/selftests/livepatch/test-shadow-vars.sh +++ b/tools/testing/selftests/livepatch/test-shadow-vars.sh @@ -6,7 +6,7 @@ MOD_TEST=test_klp_shadow_vars -set_dynamic_debug +setup_config # TEST: basic shadow variable API From 8c666d2ab5762280cb5ef43df4decb2a25450d54 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Wed, 16 Oct 2019 13:33:15 +0200 Subject: [PATCH 03/51] selftests/livepatch: Test interaction with ftrace_enabled Since livepatching depends upon ftrace handlers to implement "patched" code functionality, verify that the ftrace_enabled sysctl value interacts with livepatch registration as expected. At the same time, ensure that ftrace_enabled is set and part of the test environment configuration that is saved and restored when running the selftests. 
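The interaction under test, written out as an illustrative shell session
(the outcomes follow from the PERMANENT flag introduced in patch 01; the
exact commands and expected kernel log lines are in the test script and
its check_result block below):

  sysctl kernel.ftrace_enabled=0
  modprobe test_klp_livepatch      # fails: the PERMANENT ftrace ops cannot register (-EBUSY)
  sysctl kernel.ftrace_enabled=1
  modprobe test_klp_livepatch      # succeeds and patches cmdline_proc_show()
  sysctl kernel.ftrace_enabled=0   # now fails with EBUSY while the livepatch is loaded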
Link: http://lkml.kernel.org/r/20191016113316.13415-4-mbenes@suse.cz Signed-off-by: Joe Lawrence Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/livepatch/Makefile | 3 +- .../testing/selftests/livepatch/functions.sh | 14 +++- .../selftests/livepatch/test-ftrace.sh | 65 +++++++++++++++++++ 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/livepatch/test-ftrace.sh diff --git a/tools/testing/selftests/livepatch/Makefile b/tools/testing/selftests/livepatch/Makefile index fd405402c3ff..1886d9d94b88 100644 --- a/tools/testing/selftests/livepatch/Makefile +++ b/tools/testing/selftests/livepatch/Makefile @@ -4,6 +4,7 @@ TEST_PROGS_EXTENDED := functions.sh TEST_PROGS := \ test-livepatch.sh \ test-callbacks.sh \ - test-shadow-vars.sh + test-shadow-vars.sh \ + test-ftrace.sh include ../lib.mk diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index b7e5a67ae434..31eb09e38729 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -32,12 +32,16 @@ function die() { function push_config() { DYNAMIC_DEBUG=$(grep '^kernel/livepatch' /sys/kernel/debug/dynamic_debug/control | \ awk -F'[: ]' '{print "file " $1 " line " $2 " " $4}') + FTRACE_ENABLED=$(sysctl --values kernel.ftrace_enabled) } function pop_config() { if [[ -n "$DYNAMIC_DEBUG" ]]; then echo -n "$DYNAMIC_DEBUG" > /sys/kernel/debug/dynamic_debug/control fi + if [[ -n "$FTRACE_ENABLED" ]]; then + sysctl kernel.ftrace_enabled="$FTRACE_ENABLED" &> /dev/null + fi } function set_dynamic_debug() { @@ -47,12 +51,20 @@ function set_dynamic_debug() { EOF } +function set_ftrace_enabled() { + local sysctl="$1" + result=$(sysctl kernel.ftrace_enabled="$1" 2>&1 | paste --serial --delimiters=' ') + echo "livepatch: $result" > /dev/kmsg +} + # setup_config - save the current config and set a script exit trap that # restores the original config. Setup the dynamic debug -# for verbose livepatching output. +# for verbose livepatching output and turn on +# the ftrace_enabled sysctl. function setup_config() { push_config set_dynamic_debug + set_ftrace_enabled 1 trap pop_config EXIT INT TERM HUP } diff --git a/tools/testing/selftests/livepatch/test-ftrace.sh b/tools/testing/selftests/livepatch/test-ftrace.sh new file mode 100755 index 000000000000..e2a76887f40a --- /dev/null +++ b/tools/testing/selftests/livepatch/test-ftrace.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2019 Joe Lawrence + +. $(dirname $0)/functions.sh + +MOD_LIVEPATCH=test_klp_livepatch + +setup_config + + +# TEST: livepatch interaction with ftrace_enabled sysctl +# - turn ftrace_enabled OFF and verify livepatches can't load +# - turn ftrace_enabled ON and verify livepatch can load +# - verify that ftrace_enabled can't be turned OFF while a livepatch is loaded + +echo -n "TEST: livepatch interaction with ftrace_enabled sysctl ... 
" +dmesg -C + +set_ftrace_enabled 0 +load_failing_mod $MOD_LIVEPATCH + +set_ftrace_enabled 1 +load_lp $MOD_LIVEPATCH +if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH: this has been live patched" ]] ; then + echo -e "FAIL\n\n" + die "livepatch kselftest(s) failed" +fi + +set_ftrace_enabled 0 +if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH: this has been live patched" ]] ; then + echo -e "FAIL\n\n" + die "livepatch kselftest(s) failed" +fi +disable_lp $MOD_LIVEPATCH +unload_lp $MOD_LIVEPATCH + +check_result "livepatch: kernel.ftrace_enabled = 0 +% modprobe $MOD_LIVEPATCH +livepatch: enabling patch '$MOD_LIVEPATCH' +livepatch: '$MOD_LIVEPATCH': initializing patching transition +livepatch: failed to register ftrace handler for function 'cmdline_proc_show' (-16) +livepatch: failed to patch object 'vmlinux' +livepatch: failed to enable patch '$MOD_LIVEPATCH' +livepatch: '$MOD_LIVEPATCH': canceling patching transition, going to unpatch +livepatch: '$MOD_LIVEPATCH': completing unpatching transition +livepatch: '$MOD_LIVEPATCH': unpatching complete +modprobe: ERROR: could not insert '$MOD_LIVEPATCH': Device or resource busy +livepatch: kernel.ftrace_enabled = 1 +% modprobe $MOD_LIVEPATCH +livepatch: enabling patch '$MOD_LIVEPATCH' +livepatch: '$MOD_LIVEPATCH': initializing patching transition +livepatch: '$MOD_LIVEPATCH': starting patching transition +livepatch: '$MOD_LIVEPATCH': completing patching transition +livepatch: '$MOD_LIVEPATCH': patching complete +livepatch: sysctl: setting key \"kernel.ftrace_enabled\": Device or resource busy kernel.ftrace_enabled = 0 +% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled +livepatch: '$MOD_LIVEPATCH': initializing unpatching transition +livepatch: '$MOD_LIVEPATCH': starting unpatching transition +livepatch: '$MOD_LIVEPATCH': completing unpatching transition +livepatch: '$MOD_LIVEPATCH': unpatching complete +% rmmod $MOD_LIVEPATCH" + + +exit 0 From 714641c3670cdc75371a7ff5bdfd5e9a170c7ffd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:25:46 -0500 Subject: [PATCH 04/51] ftrace: Separate out the copying of a ftrace_hash from __ftrace_hash_move() Most of the functionality of __ftrace_hash_move() can be reused, but not all of it. That is, __ftrace_hash_move() is used to simply make a new hash from an existing one, using the same size as the original. Creating a dup_hash(), where we can specify a new size will be useful when we want to create a hash with a default size, or simply copy the old one. Signed-off-by: Steven Rostedt (VMWare) --- kernel/trace/ftrace.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89e9128652ef..76e5de8c7822 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1372,23 +1372,15 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static struct ftrace_hash * -__ftrace_hash_move(struct ftrace_hash *src) +static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) { struct ftrace_func_entry *entry; - struct hlist_node *tn; - struct hlist_head *hhd; struct ftrace_hash *new_hash; - int size = src->count; + struct hlist_head *hhd; + struct hlist_node *tn; int bits = 0; int i; - /* - * If the new source is empty, just return the empty_hash. 
- */ - if (ftrace_hash_empty(src)) - return EMPTY_HASH; - /* * Make the hash size about 1/2 the # found */ @@ -1413,10 +1405,23 @@ __ftrace_hash_move(struct ftrace_hash *src) __add_hash_entry(new_hash, entry); } } - return new_hash; } +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) +{ + int size = src->count; + + /* + * If the new source is empty, just return the empty_hash. + */ + if (ftrace_hash_empty(src)) + return EMPTY_HASH; + + return dup_hash(src, size); +} + static int ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) From 7e16f581a81759bafea04d049134b32d1a881226 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:26:46 -0500 Subject: [PATCH 05/51] ftrace: Separate out functionality from ftrace_location_range() Create a new function called lookup_rec() from the functionality of ftrace_location_range(). The difference between lookup_rec() is that it returns the record that it finds, where as ftrace_location_range() returns only if it found a match or not. The lookup_rec() is static, and can be used for new functionality where ftrace needs to find a record of a specific address. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76e5de8c7822..b0e7f03919de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1541,6 +1541,26 @@ static int ftrace_cmp_recs(const void *a, const void *b) return 0; } +static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec = NULL; + struct dyn_ftrace key; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + } + return rec; +} + /** * ftrace_location_range - return the first address of a traced location * if it touches the given ip range @@ -1555,23 +1575,11 @@ static int ftrace_cmp_recs(const void *a, const void *b) */ unsigned long ftrace_location_range(unsigned long start, unsigned long end) { - struct ftrace_page *pg; struct dyn_ftrace *rec; - struct dyn_ftrace key; - key.ip = start; - key.flags = end; /* overload flags, as it is unsigned long */ - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || - start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) - continue; - rec = bsearch(&key, pg->records, pg->index, - sizeof(struct dyn_ftrace), - ftrace_cmp_recs); - if (rec) - return rec->ip; - } + rec = lookup_rec(start, end); + if (rec) + return rec->ip; return 0; } From 763e34e74bb7d5c316015e2e39fcc8520bfd071c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:07:06 -0500 Subject: [PATCH 06/51] ftrace: Add register_ftrace_direct() Add the start of the functionality to allow other trampolines to use the ftrace mcount/fentry/nop location. This adds two new functions: register_ftrace_direct() and unregister_ftrace_direct() Both take two parameters: the first is the instruction address of where the mcount/fentry/nop exists, and the second is the trampoline to have that location called. 
This will handle cases where ftrace is already used on that same location,
and will make it still work, where the registered direct-call trampoline
will get called after all the registered ftrace callbacks are handled.

Currently, it will not allow for IPMODIFY functions to be called at the
same locations, which include some kprobes and live kernel patching. At
this point, no architecture supports this. This is only the start of
implementing the framework.

Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/ftrace.h | 36 +++++-
 kernel/trace/Kconfig | 8 ++
 kernel/trace/ftrace.c | 269 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 306 insertions(+), 7 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8385cafe4f9f..efe3e521aff4 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -144,6 +144,8 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
 * TRACE_ARRAY - The ops->private points to a trace_array descriptor.
 * PERMANENT - Set when the ops is permanent and should not be affected by
 * ftrace_enabled.
+ * DIRECT - Used by the direct ftrace_ops helper for direct functions
+ * (internal ftrace only, should not be used by others)
 */
 enum {
 FTRACE_OPS_FL_ENABLED = 1 << 0,
@@ -163,6 +165,7 @@ enum {
 FTRACE_OPS_FL_RCU = 1 << 14,
 FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15,
 FTRACE_OPS_FL_PERMANENT = 1 << 16,
+ FTRACE_OPS_FL_DIRECT = 1 << 17,
 };

 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -242,6 +245,32 @@ static inline void ftrace_free_init_mem(void) { }
 static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { }
 #endif /* CONFIG_FUNCTION_TRACER */

+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+int register_ftrace_direct(unsigned long ip, unsigned long addr);
+int unregister_ftrace_direct(unsigned long ip, unsigned long addr);
+#else
+static inline int register_ftrace_direct(unsigned long ip, unsigned long addr)
+{
+	return -ENODEV;
+}
+static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * This must be implemented by the architecture.
+ * It is the way the ftrace direct_ops helper, when called
+ * via ftrace (because there are other callbacks besides the
+ * direct call), can inform the architecture's trampoline that this
+ * routine has a direct caller, and what the caller is.
+ */
+static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs,
+						 unsigned long addr) { }
+#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
 #ifdef CONFIG_STACK_TRACER

 extern int stack_tracer_enabled;
@@ -333,6 +362,7 @@ bool is_ftrace_trampoline(unsigned long addr);
 * REGS_EN - the function is set up to save regs.
 * IPMODIFY - the record allows for the IP address to be changed.
 * DISABLED - the record is not ready to be touched yet
+ * DIRECT - there is a direct function to call
 *
 * When a new ftrace_ops is registered and wants a function to save
 * pt_regs, the rec->flag REGS is set.
When the function has been @@ -348,10 +378,12 @@ enum { FTRACE_FL_TRAMP_EN = (1UL << 27), FTRACE_FL_IPMODIFY = (1UL << 26), FTRACE_FL_DISABLED = (1UL << 25), + FTRACE_FL_DIRECT = (1UL << 24), + FTRACE_FL_DIRECT_EN = (1UL << 23), }; -#define FTRACE_REF_MAX_SHIFT 25 -#define FTRACE_FL_BITS 7 +#define FTRACE_REF_MAX_SHIFT 23 +#define FTRACE_FL_BITS 9 #define FTRACE_FL_MASKED_BITS ((1UL << FTRACE_FL_BITS) - 1) #define FTRACE_FL_MASK (FTRACE_FL_MASKED_BITS << FTRACE_REF_MAX_SHIFT) #define FTRACE_REF_MAX ((1UL << FTRACE_REF_MAX_SHIFT) - 1) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..624a05e99b0b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -33,6 +33,9 @@ config HAVE_DYNAMIC_FTRACE config HAVE_DYNAMIC_FTRACE_WITH_REGS bool +config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -557,6 +560,11 @@ config DYNAMIC_FTRACE_WITH_REGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_REGS +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0e7f03919de..329a3f3789a1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1023,6 +1023,7 @@ static bool update_all_ops; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; + unsigned long direct; /* for direct lookup only */ }; struct ftrace_func_probe { @@ -1730,6 +1731,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) return false; + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT; + /* * If there's only a single callback registered to a * function, and the ops has a trampoline registered @@ -1757,6 +1761,15 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, return false; rec->flags--; + /* + * Only the internal direct_ops should have the + * DIRECT flag set. Thus, if it is removing a + * function, then that function should no longer + * be direct. + */ + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags &= ~FTRACE_FL_DIRECT; + /* * If the rec had REGS enabled and the ops that is * being removed had REGS set, then see if there is @@ -2092,15 +2105,34 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * If enabling and the REGS flag does not match the REGS_EN, or * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore * this record. Set flags to fail the compare against ENABLED. + * Same for direct calls. */ if (flag) { - if (!(rec->flags & FTRACE_FL_REGS) != + if (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)) flag |= FTRACE_FL_REGS; - if (!(rec->flags & FTRACE_FL_TRAMP) != + if (!(rec->flags & FTRACE_FL_TRAMP) != !(rec->flags & FTRACE_FL_TRAMP_EN)) flag |= FTRACE_FL_TRAMP; + + /* + * Direct calls are special, as count matters. + * We must test the record for direct, if the + * DIRECT and DIRECT_EN do not match, but only + * if the count is 1. That's because, if the + * count is something other than one, we do not + * want the direct enabled (it will be done via the + * direct helper). But if DIRECT_EN is set, and + * the count is not one, we need to clear it. 
+ */ + if (ftrace_rec_count(rec) == 1) { + if (!(rec->flags & FTRACE_FL_DIRECT) != + !(rec->flags & FTRACE_FL_DIRECT_EN)) + flag |= FTRACE_FL_DIRECT; + } else if (rec->flags & FTRACE_FL_DIRECT_EN) { + flag |= FTRACE_FL_DIRECT; + } } /* If the state of this record hasn't changed, then do nothing */ @@ -2125,6 +2157,25 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { + /* + * If there's only one user (direct_ops helper) + * then we can call the direct function + * directly (no ftrace trampoline). + */ + if (ftrace_rec_count(rec) == 1) { + if (rec->flags & FTRACE_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT_EN; + else + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } else { + /* + * Can only call directly if there's + * only one callback to the function. + */ + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } + } } /* @@ -2154,7 +2205,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * and REGS states. The _EN flags must be disabled though. */ rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | - FTRACE_FL_REGS_EN); + FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN); } ftrace_bug_type = FTRACE_BUG_NOP; @@ -2309,6 +2360,51 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) return NULL; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* Protected by rcu_tasks for reading, and direct_mutex for writing */ +static struct ftrace_hash *direct_functions = EMPTY_HASH; +static DEFINE_MUTEX(direct_mutex); + +/* + * Search the direct_functions hash to see if the given instruction pointer + * has a direct caller attached to it. + */ +static unsigned long find_rec_direct(unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) + return 0; + + return entry->direct; +} + +static void call_direct_funcs(unsigned long ip, unsigned long pip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + unsigned long addr; + + addr = find_rec_direct(ip); + if (!addr) + return; + + arch_ftrace_set_direct_caller(regs, addr); +} + +struct ftrace_ops direct_ops = { + .func = call_direct_funcs, + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS + | FTRACE_OPS_FL_PERMANENT, +}; +#else +static inline unsigned long find_rec_direct(unsigned long ip) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_get_addr_new - Get the call address to set to * @rec: The ftrace record descriptor @@ -2322,6 +2418,15 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + if ((rec->flags & FTRACE_FL_DIRECT) && + (ftrace_rec_count(rec) == 1)) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP) { @@ -2354,6 +2459,15 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + /* Direct calls take precedence over trampolines */ + if (rec->flags & FTRACE_FL_DIRECT_EN) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP_EN) { @@ -3465,10 +3579,11 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & 
FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; - seq_printf(m, " (%ld)%s%s", + seq_printf(m, " (%ld)%s%s%s", ftrace_rec_count(rec), rec->flags & FTRACE_FL_REGS ? " R" : " ", - rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); + rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ", + rec->flags & FTRACE_FL_DIRECT ? " D" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); if (ops) { @@ -3484,6 +3599,13 @@ static int t_show(struct seq_file *m, void *v) } else { add_trampoline_func(m, NULL, rec); } + if (rec->flags & FTRACE_FL_DIRECT) { + unsigned long direct; + + direct = find_rec_direct(rec->ip); + if (direct) + seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + } } seq_putc(m, '\n'); @@ -4815,6 +4937,143 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/** + * register_ftrace_direct - Call a custom trampoline directly + * @ip: The address of the nop at the beginning of a function + * @addr: The address of the trampoline to call at @ip + * + * This is used to connect a direct call from the nop location (@ip) + * at the start of ftrace traced functions. The location that it calls + * (@addr) must be able to handle a direct call, and save the parameters + * of the function being traced, and restore them (or inject new ones + * if needed), before returning. + * + * Returns: + * 0 on success + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure. + */ +int register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *free_hash = NULL; + struct dyn_ftrace *rec; + int ret = -EBUSY; + + mutex_lock(&direct_mutex); + + /* See if there's a direct function at @ip already */ + if (find_rec_direct(ip)) + goto out_unlock; + + ret = -ENODEV; + rec = lookup_rec(ip, ip); + if (!rec) + goto out_unlock; + + /* + * Check if the rec says it has a direct call but we didn't + * find one earlier? + */ + if (WARN_ON(rec->flags & FTRACE_FL_DIRECT)) + goto out_unlock; + + /* Make sure the ip points to the exact record */ + ip = rec->ip; + + ret = -ENOMEM; + if (ftrace_hash_empty(direct_functions) || + direct_functions->count > 2 * (1 << direct_functions->size_bits)) { + struct ftrace_hash *new_hash; + int size = ftrace_hash_empty(direct_functions) ? 
0 :
+			direct_functions->count + 1;
+
+		if (size < 32)
+			size = 32;
+
+		new_hash = dup_hash(direct_functions, size);
+		if (!new_hash)
+			goto out_unlock;
+
+		free_hash = direct_functions;
+		direct_functions = new_hash;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		goto out_unlock;
+
+	entry->ip = ip;
+	entry->direct = addr;
+	__add_hash_entry(direct_functions, entry);
+
+	ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
+	if (ret)
+		remove_hash_entry(direct_functions, entry);
+
+	if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
+		ret = register_ftrace_function(&direct_ops);
+		if (ret)
+			ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
+	}
+
+	if (ret)
+		kfree(entry);
+ out_unlock:
+	mutex_unlock(&direct_mutex);
+
+	if (free_hash) {
+		synchronize_rcu_tasks();
+		free_ftrace_hash(free_hash);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_direct);
+
+int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
+{
+	struct ftrace_func_entry *entry;
+	struct dyn_ftrace *rec;
+	int ret = -ENODEV;
+
+	mutex_lock(&direct_mutex);
+
+	entry = __ftrace_lookup_ip(direct_functions, ip);
+	if (!entry) {
+		/* OK if it is off by a little */
+		rec = lookup_rec(ip, ip);
+		if (!rec || rec->ip == ip)
+			goto out_unlock;
+
+		entry = __ftrace_lookup_ip(direct_functions, rec->ip);
+		if (!entry) {
+			WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+			goto out_unlock;
+		}
+
+		WARN_ON(!(rec->flags & FTRACE_FL_DIRECT));
+	}
+
+	if (direct_functions->count == 1)
+		unregister_ftrace_function(&direct_ops);
+
+	ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
+
+	WARN_ON(ret);
+
+	remove_hash_entry(direct_functions, entry);
+
+ out_unlock:
+	mutex_unlock(&direct_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
 /**
 * ftrace_set_filter_ip - set a function to filter on in ftrace by address
 * @ops - the ops to set the filter with

From 013bf0da0474816f57739daa006c8564ad7396a3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 8 Nov 2019 13:11:27 -0500
Subject: [PATCH 07/51] ftrace: Add ftrace_find_direct_func()

As the function_graph tracer modifies the return address to insert a
trampoline to trace the return of a function, it must be aware of a direct
caller, as when it gets called, the function's return address may not be
on the stack where it expects. It may have to see if that return address
points to a direct caller and adjust if it is.
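In code, the adjustment described above looks roughly like this (a sketch;
patch 12 of this series implements it for x86 in prepare_ftrace_return()):

  /* If the recorded return address is really a direct trampoline, the
   * traced function's real return address sits one word further up. */
  if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) {
          self_addr = *parent;
          parent++;
  }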
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++ kernel/trace/ftrace.c | 79 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index efe3e521aff4..8b37b8105398 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -51,6 +51,7 @@ static inline void early_trace_init(void) { } struct module; struct ftrace_hash; +struct ftrace_direct_func; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) @@ -248,6 +249,7 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); +struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); #else static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { @@ -257,6 +259,10 @@ static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { return -ENODEV; } +static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) +{ + return NULL; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 329a3f3789a1..c4446eabacbe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4938,6 +4938,46 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + +struct ftrace_direct_func { + struct list_head next; + unsigned long addr; + int count; +}; + +static LIST_HEAD(ftrace_direct_funcs); + +/** + * ftrace_find_direct_func - test an address if it is a registered direct caller + * @addr: The address of a registered direct caller + * + * This searches to see if a ftrace direct caller has been registered + * at a specific address, and if so, it returns a descriptor for it. + * + * This can be used by architecture code to see if an address is + * a direct caller (trampoline) attached to a fentry/mcount location. + * This is useful for the function_graph tracer, as it may need to + * do adjustments if it traced a location that also has a direct + * trampoline attached to it. 
+ */ +struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) +{ + struct ftrace_direct_func *entry; + bool found = false; + + /* May be called by fgraph trampoline (protected by rcu tasks) */ + list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) { + if (entry->addr == addr) { + found = true; + break; + } + } + if (found) + return entry; + + return NULL; +} + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -4957,6 +4997,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, */ int register_ftrace_direct(unsigned long ip, unsigned long addr) { + struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; @@ -5005,6 +5046,18 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; + direct = ftrace_find_direct_func(addr); + if (!direct) { + direct = kmalloc(sizeof(*direct), GFP_KERNEL); + if (!direct) { + kfree(entry); + goto out_unlock; + } + direct->addr = addr; + direct->count = 0; + list_add_rcu(&direct->next, &ftrace_direct_funcs); + } + entry->ip = ip; entry->direct = addr; __add_hash_entry(direct_functions, entry); @@ -5019,8 +5072,20 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_set_filter_ip(&direct_ops, ip, 1, 0); } - if (ret) + if (ret) { kfree(entry); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + if (free_hash) + free_ftrace_hash(free_hash); + free_hash = NULL; + } + } else { + if (!direct->count) + direct->count++; + } out_unlock: mutex_unlock(&direct_mutex); @@ -5036,6 +5101,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { struct ftrace_func_entry *entry; + struct ftrace_direct_func *direct; struct dyn_ftrace *rec; int ret = -ENODEV; @@ -5066,6 +5132,17 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) remove_hash_entry(direct_functions, entry); + direct = ftrace_find_direct_func(addr); + if (!WARN_ON(!direct)) { + /* This is the good path (see the ! before WARN) */ + direct->count--; + WARN_ON(direct->count < 0); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + } + } out_unlock: mutex_unlock(&direct_mutex); From b06457c83af669bb9ef71f5b84a3132657dce8a1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:12:33 -0500 Subject: [PATCH 08/51] ftrace: Add sample module that uses register_ftrace_direct() Add a sample module that shows a simple use case for regsiter_ftrace_direct(), and how to use it. Signed-off-by: Steven Rostedt (VMware) --- samples/Kconfig | 8 ++++++ samples/Makefile | 1 + samples/ftrace/Makefile | 3 +++ samples/ftrace/ftrace-direct.c | 45 ++++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) create mode 100644 samples/ftrace/Makefile create mode 100644 samples/ftrace/ftrace-direct.c diff --git a/samples/Kconfig b/samples/Kconfig index c8dacb4dda80..65b5967dac0c 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -19,6 +19,14 @@ config SAMPLE_TRACE_PRINTK This builds a module that calls trace_printk() and can be used to test various trace_printk() calls from a module. 
+config SAMPLE_FTRACE_DIRECT
+	tristate "Build register_ftrace_direct() example"
+	depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m
+	depends on X86_64 # has x86_64 inlined asm
+	help
+	  This builds an ftrace direct function example
+	  that hooks to wake_up_process and prints the parameters.
+
 config SAMPLE_KOBJECT
 	tristate "Build kobject examples"
 	help

diff --git a/samples/Makefile b/samples/Makefile
index 7d6e4ca28d69..cd496d633370 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/
 subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp
 obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/
 obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/
 obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/
 obj-y += vfio-mdev/
 subdir-$(CONFIG_SAMPLE_VFS) += vfs

diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile
new file mode 100644
index 000000000000..3718ab39eba3
--- /dev/null
+++ b/samples/ftrace/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o

diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c
new file mode 100644
index 000000000000..1483f067b000
--- /dev/null
+++ b/samples/ftrace/ftrace-direct.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+
+#include <linux/sched.h> /* for wake_up_process() */
+#include <linux/ftrace.h>
+
+void my_direct_func(struct task_struct *p)
+{
+	trace_printk("waking up %s-%d\n", p->comm, p->pid);
+}
+
+extern void my_tramp(void *);
+
+asm (
+"	.pushsection	.text, \"ax\", @progbits\n"
+"   my_tramp:"
+"	pushq %rbp\n"
+"	movq %rsp, %rbp\n"
+"	pushq %rdi\n"
+"	call my_direct_func\n"
+"	popq %rdi\n"
+"	leave\n"
+"	ret\n"
+"	.popsection\n"
+);
+
+
+static int __init ftrace_direct_init(void)
+{
+	return register_ftrace_direct((unsigned long)wake_up_process,
+				      (unsigned long)my_tramp);
+}
+
+static void __exit ftrace_direct_exit(void)
+{
+	unregister_ftrace_direct((unsigned long)wake_up_process,
+				 (unsigned long)my_tramp);
+}
+
+module_init(ftrace_direct_init);
+module_exit(ftrace_direct_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("Example use case of using register_ftrace_direct()");
+MODULE_LICENSE("GPL");

From 646f01ccdd59f989f0a1866d2b3503e1855358d8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 8 Nov 2019 12:19:14 -0500
Subject: [PATCH 09/51] ftrace/selftest: Add tests to test
 register_ftrace_direct()

Add two test cases that test the new ftrace direct functionality if the
ftrace-direct sample module is available. One test case tests against each
available tracer (function, function_graph, mmiotrace, etc), and the other
tests against a kprobe at the same location as the direct caller.

Both tests follow the same pattern of testing combinations:

  enable test (either the tracer or the kprobe)
  load direct function module
  unload direct function module
  disable test

  enable test
  load direct function module
  disable test
  unload direct function module

  load direct function module
  enable test
  disable test
  unload direct function module

  load direct function module
  enable test
  unload direct function module
  disable test

Most of the bugs during development happened with various ways of enabling
or disabling the direct calls with the function tracer in one of these
combinations.
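One of these combinations, spelled out as plain commands (the .tc scripts
below parameterize exactly this over every tracer in available_tracers):

  echo function > current_tracer   # enable test
  modprobe ftrace-direct           # load direct function module
  rmmod ftrace-direct              # unload direct function module
  echo nop > current_tracer        # disable test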
Signed-off-by: Steven Rostedt (VMware)
---
 .../ftrace/test.d/direct/ftrace-direct.tc | 53 ++++++++++++++
 .../ftrace/test.d/direct/kprobe-direct.tc | 71 +++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc
 create mode 100644 tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc

diff --git a/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc b/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc
new file mode 100644
index 000000000000..8b8ed3cad51b
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc
@@ -0,0 +1,53 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test ftrace direct functions against tracers
+
+rmmod ftrace-direct ||:
+if ! modprobe ftrace-direct ; then
+	echo "No ftrace-direct sample module - please make CONFIG_SAMPLE_FTRACE_DIRECT=m"
+	exit_unresolved;
+fi
+
+echo "Let the module run a little"
+sleep 1
+
+grep -q "my_direct_func: waking up" trace
+
+rmmod ftrace-direct
+
+test_tracer() {
+	tracer=$1
+
+	# tracer -> direct -> no direct > no tracer
+	echo $tracer > current_tracer
+	modprobe ftrace-direct
+	rmmod ftrace-direct
+	echo nop > current_tracer
+
+	# tracer -> direct -> no tracer > no direct
+	echo $tracer > current_tracer
+	modprobe ftrace-direct
+	echo nop > current_tracer
+	rmmod ftrace-direct
+
+	# direct -> tracer -> no tracer > no direct
+	modprobe ftrace-direct
+	echo $tracer > current_tracer
+	echo nop > current_tracer
+	rmmod ftrace-direct
+
+	# direct -> tracer -> no direct > no tracer
+	modprobe ftrace-direct
+	echo $tracer > current_tracer
+	rmmod ftrace-direct
+	echo nop > current_tracer
+}
+
+for t in `cat available_tracers`; do
+	if [ "$t" != "nop" ]; then
+		test_tracer $t
+	fi
+done
+
+echo nop > current_tracer
+rmmod ftrace-direct ||:

diff --git a/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc b/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc
new file mode 100644
index 000000000000..f030dc80bcde
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc
@@ -0,0 +1,71 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test ftrace direct functions against kprobes
+
+rmmod ftrace-direct ||:
+if ! modprobe ftrace-direct ; then
+	echo "No ftrace-direct sample module - please build with CONFIG_SAMPLE_FTRACE_DIRECT=m"
+	exit_unresolved;
+fi
+
+if [ !
-f kprobe_events ]; then
+	echo "No kprobe_events file - please build CONFIG_KPROBE_EVENTS"
+	exit_unresolved;
+fi
+
+echo "Let the module run a little"
+sleep 1
+
+grep -q "my_direct_func: waking up" trace
+
+rmmod ftrace-direct
+
+echo 'p:kwake wake_up_process task=$arg1' > kprobe_events
+
+start_direct() {
+	echo > trace
+	modprobe ftrace-direct
+	sleep 1
+	grep -q "my_direct_func: waking up" trace
+}
+
+stop_direct() {
+	rmmod ftrace-direct
+}
+
+enable_probe() {
+	echo > trace
+	echo 1 > events/kprobes/kwake/enable
+	sleep 1
+	grep -q "kwake:" trace
+}
+
+disable_probe() {
+	echo 0 > events/kprobes/kwake/enable
+}
+
+# probe -> direct -> no direct > no probe
+enable_probe
+start_direct
+stop_direct
+disable_probe
+
+# probe -> direct -> no probe > no direct
+enable_probe
+start_direct
+disable_probe
+stop_direct
+
+# direct -> probe -> no probe > no direct
+start_direct
+enable_probe
+disable_probe
+stop_direct
+
+# direct -> probe -> no direct > no probe
+start_direct
+enable_probe
+stop_direct
+disable_probe
+
+echo > kprobe_events

From 156473a0ff4f9c209114b4c3ad968099eb541e33 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Fri, 8 Nov 2019 15:27:45 -0500
Subject: [PATCH 10/51] ftrace: Add another example of register_ftrace_direct()
 use case

Add another sample module that registers a direct trampoline to a function
via register_ftrace_direct(). Having another module that does this allows
testing the use case of multiple registered direct callers, as more than
one direct caller goes down a different code path, and this is needed to
properly test the register_ftrace_direct() call.

Signed-off-by: Steven Rostedt (VMware)
---
 samples/ftrace/Makefile | 1 +
 samples/ftrace/ftrace-direct-too.c | 51 ++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 samples/ftrace/ftrace-direct-too.c

diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile
index 3718ab39eba3..d8217c4e072e 100644
--- a/samples/ftrace/Makefile
+++ b/samples/ftrace/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only

 obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o

diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c
new file mode 100644
index 000000000000..27efa5f6ff52
--- /dev/null
+++ b/samples/ftrace/ftrace-direct-too.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+
+#include <linux/mm.h> /* for handle_mm_fault() */
+#include <linux/ftrace.h>
+
+void my_direct_func(struct vm_area_struct *vma,
+		    unsigned long address, unsigned int flags)
+{
+	trace_printk("handle mm fault vma=%p address=%lx flags=%x\n",
+		     vma, address, flags);
+}
+
+extern void my_tramp(void *);
+
+asm (
+"	.pushsection	.text, \"ax\", @progbits\n"
+"   my_tramp:"
+"	pushq %rbp\n"
+"	movq %rsp, %rbp\n"
+"	pushq %rdi\n"
+"	pushq %rsi\n"
+"	pushq %rdx\n"
+"	call my_direct_func\n"
+"	popq %rdx\n"
+"	popq %rsi\n"
+"	popq %rdi\n"
+"	leave\n"
+"	ret\n"
+"	.popsection\n"
+);
+
+
+static int __init ftrace_direct_init(void)
+{
+	return register_ftrace_direct((unsigned long)handle_mm_fault,
+				      (unsigned long)my_tramp);
+}
+
+static void __exit ftrace_direct_exit(void)
+{
+	unregister_ftrace_direct((unsigned long)handle_mm_fault,
+				 (unsigned long)my_tramp);
+}
+
+module_init(ftrace_direct_init);
+module_exit(ftrace_direct_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("Another example use case of using register_ftrace_direct()");
+MODULE_LICENSE("GPL");

From ed9dafebce52206382de96bd5bbdff22335930f6 Mon
Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 16:08:12 -0500 Subject: [PATCH 11/51] ftrace/selftests: Update the direct call selftests to test two direct calls The register_ftrace_direct() takes a different path if there's already a direct call registered, but this was not tested in the self tests. Now that there's a second direct caller test module, we can use this to test not only one direct caller, but two. Signed-off-by: Steven Rostedt (VMware) --- .../ftrace/test.d/direct/ftrace-direct.tc | 16 ++++++ .../ftrace/test.d/direct/kprobe-direct.tc | 53 ++++++++++++------- 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc b/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc index 8b8ed3cad51b..cbc7a30c2e0f 100644 --- a/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc +++ b/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc @@ -51,3 +51,19 @@ done echo nop > current_tracer rmmod ftrace-direct ||: + +# Now do the same thing with another direct function registered +echo "Running with another ftrace direct function" + +rmmod ftrace-direct-too ||: +modprobe ftrace-direct-too + +for t in `cat available_tracers`; do + if [ "$t" != "nop" ]; then + test_tracer $t + fi +done + +echo nop > current_tracer +rmmod ftrace-direct ||: +rmmod ftrace-direct-too ||: diff --git a/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc b/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc index f030dc80bcde..017a7409b103 100644 --- a/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc +++ b/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc @@ -44,28 +44,41 @@ disable_probe() { echo 0 > events/kprobes/kwake/enable } -# probe -> direct -> no direct > no probe -enable_probe -start_direct -stop_direct -disable_probe +test_kprobes() { + # probe -> direct -> no direct > no probe + enable_probe + start_direct + stop_direct + disable_probe -# probe -> direct -> no probe > no direct -enable_probe -start_direct -disable_probe -stop_direct + # probe -> direct -> no probe > no direct + enable_probe + start_direct + disable_probe + stop_direct -# direct -> probe -> no probe > no direct -start_direct -enable_probe -disable_probe -stop_direct + # direct -> probe -> no probe > no direct + start_direct + enable_probe + disable_probe + stop_direct -# direct -> probe -> no direct > no noprobe -start_direct -enable_probe -stop_direct -disable_probe + # direct -> probe -> no direct > no noprobe + start_direct + enable_probe + stop_direct + disable_probe +} + +test_kprobes + +# Now do this with a second registered direct function +echo "Running with another ftrace direct function" + +modprobe ftrace-direct-too + +test_kprobes + +rmmod ftrace-direct-too echo > kprobe_events From 562955fe6a558b9ef98ad87c470314946338cb2f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:11:39 -0500 Subject: [PATCH 12/51] ftrace/x86: Add register_ftrace_direct() for custom trampolines Enable x86 to allow for register_ftrace_direct(), where a custom trampoline may be called directly from an ftrace mcount/fentry location. 
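For reference, this is what such a custom trampoline looks like on x86_64,
in the spirit of the sample modules from earlier in this series (a sketch;
my_direct_func is an assumed C helper, and the trampoline must preserve
the registers that carry the traced function's arguments):

  asm (
  "	.pushsection	.text, \"ax\", @progbits\n"
  "   my_tramp:"
  "	pushq %rbp\n"
  "	movq %rsp, %rbp\n"
  "	pushq %rdi\n"		/* save the first argument */
  "	call my_direct_func\n"
  "	popq %rdi\n"		/* restore it for the traced function */
  "	leave\n"
  "	ret\n"
  "	.popsection\n"
  );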
Signed-off-by: Steven Rostedt (VMware) --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 13 +++++++++++++ arch/x86/kernel/ftrace.c | 12 ++++++++++++ arch/x86/kernel/ftrace_64.S | 34 +++++++++++++++++++++++++++------- include/linux/ftrace.h | 6 ++++++ 5 files changed, 59 insertions(+), 7 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d6e1faa28c58..329d9c729ba3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -158,6 +158,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index c38a66661576..c2a7458f912c 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,6 +28,19 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +/* + * When a ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, we + * place the direct caller in the ORIG_AX part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. + */ +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + /* Emulate a call */ + regs->orig_ax = addr; +} + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 024c3053dbba..fef283f6341d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1042,6 +1042,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; + /* + * If the return location is actually pointing directly to + * the start of a direct trampoline (if we trace the trampoline + * it will still be offset by MCOUNT_INSN_SIZE), then the + * return address is actually off by one word, and we + * need to adjust for that. + */ + if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { + self_addr = *parent; + parent++; + } + /* * Protect against fault, even if it shouldn't * happen. This tool is too much intrusive to diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 809d54397dba..6ac7ff304886 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -88,6 +88,7 @@ EXPORT_SYMBOL(__fentry__) movq %rdi, RDI(%rsp) movq %r8, R8(%rsp) movq %r9, R9(%rsp) + movq $0, ORIG_RAX(%rsp) /* * Save the original RBP. Even though the mcount ABI does not * require this, it helps out callers. 
@@ -114,7 +115,11 @@ EXPORT_SYMBOL(__fentry__) subq $MCOUNT_INSN_SIZE, %rdi .endm -.macro restore_mcount_regs +.macro restore_mcount_regs save=0 + + /* ftrace_regs_caller or frame pointers require this */ + movq RBP(%rsp), %rbp + movq R9(%rsp), %r9 movq R8(%rsp), %r8 movq RDI(%rsp), %rdi @@ -123,10 +128,7 @@ EXPORT_SYMBOL(__fentry__) movq RCX(%rsp), %rcx movq RAX(%rsp), %rax - /* ftrace_regs_caller can modify %rbp */ - movq RBP(%rsp), %rbp - - addq $MCOUNT_REG_SIZE, %rsp + addq $MCOUNT_REG_SIZE-\save, %rsp .endm @@ -228,10 +230,28 @@ GLOBAL(ftrace_regs_call) movq R10(%rsp), %r10 movq RBX(%rsp), %rbx - restore_mcount_regs + movq ORIG_RAX(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE-8(%rsp) + + /* If ORIG_RAX is anything but zero, make this a call to that */ + movq ORIG_RAX(%rsp), %rax + cmpq $0, %rax + je 1f + + /* Swap the flags with orig_rax */ + movq MCOUNT_REG_SIZE(%rsp), %rdi + movq %rdi, MCOUNT_REG_SIZE-8(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) + + restore_mcount_regs 8 + + jmp 2f + +1: restore_mcount_regs + /* Restore flags */ - popfq +2: popfq /* * As this jmp to ftrace_epilogue can be a short jump diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8b37b8105398..2bc7bd6b8387 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -272,6 +272,12 @@ static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long a * via ftrace (because there's other callbacks besides the * direct call), can inform the architecture's trampoline that this * routine has a direct caller, and what the caller is. + * + * For example, in x86, it returns the direct caller + * callback function via the regs->orig_ax parameter. + * Then in the ftrace trampoline, if this is set, it makes + * the return from the trampoline jump to the direct caller + * instead of going back to the function it just traced. */ static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) { } From a3ad1a7e39689005cb04a4f2adb82f9d55b4724f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:12:57 -0500 Subject: [PATCH 13/51] ftrace/x86: Add a counter to test function_graph with direct As testing for direct calls from the function graph tracer adds a little overhead (which is a lot when tracing every function), add a counter that can be used to test if function_graph tracer needs to test for a direct caller or not. It would have been nicer if we could use a static branch, but the static branch logic fails when used within the function graph tracer trampoline. Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace.c | 8 +++++--- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 4 ++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fef283f6341d..060a361d9d11 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1049,9 +1049,11 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, * return address is actually off by one word, and we * need to adjust for that. 
*/ - if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { - self_addr = *parent; - parent++; + if (ftrace_direct_func_count) { + if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { + self_addr = *parent; + parent++; + } } /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2bc7bd6b8387..55647e185141 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -247,10 +247,12 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); #else +# define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { return -ENODEV; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c4446eabacbe..f9456346ec66 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2364,6 +2364,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); +int ftrace_direct_func_count; /* * Search the direct_functions hash to see if the given instruction pointer @@ -5056,6 +5057,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) direct->addr = addr; direct->count = 0; list_add_rcu(&direct->next, &ftrace_direct_funcs); + ftrace_direct_func_count++; } entry->ip = ip; @@ -5081,6 +5083,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (free_hash) free_ftrace_hash(free_hash); free_hash = NULL; + ftrace_direct_func_count--; } } else { if (!direct->count) @@ -5141,6 +5144,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) list_del_rcu(&direct->next); synchronize_rcu_tasks(); kfree(direct); + ftrace_direct_func_count--; } } out_unlock: From 77ac117b3a82251b109ffc5daf7d1c5392734be3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 8 Nov 2019 16:51:00 -0600 Subject: [PATCH 14/51] ftrace/x86: Tell objtool to ignore nondeterministic ftrace stack layout Objtool complains about the new ftrace direct trampoline code: arch/x86/kernel/ftrace_64.o: warning: objtool: ftrace_regs_caller()+0x190: stack state mismatch: cfa1=7+16 cfa2=7+24 Typically, code has a deterministic stack layout, such that at a given instruction address, the stack frame size is always the same. That's not the case for the new ftrace_regs_caller() code after it adjusts the stack for the direct case. Just plead ignorance and assume it's always the non-direct path. Note this creates a tiny window for ORC to get confused. 
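For reference, the warning can be reproduced by running objtool by hand on the object file (the invocation below is a sketch, assuming a built x86-64 tree):

  $ tools/objtool/objtool check arch/x86/kernel/ftrace_64.o
  arch/x86/kernel/ftrace_64.o: warning: objtool: ftrace_regs_caller()+0x190: stack state mismatch: cfa1=7+16 cfa2=7+24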
Link: http://lkml.kernel.org/r/20191108225100.ea3bhsbdf6oerj6g@treble Reported-by: Steven Rostedt Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/unwind_hints.h | 8 ++++++++ arch/x86/kernel/ftrace_64.S | 12 +++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 0bcdb1279361..f5e2eb12cb71 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -86,6 +86,14 @@ UNWIND_HINT sp_offset=\sp_offset .endm +.macro UNWIND_HINT_SAVE + UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE +.endm + +.macro UNWIND_HINT_RESTORE + UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE +.endm + #else /* !__ASSEMBLY__ */ #define UNWIND_HINT(sp_reg, sp_offset, type, end) \ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 6ac7ff304886..b33abdd0a2db 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -178,6 +178,8 @@ ENTRY(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq + UNWIND_HINT_SAVE + /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ @@ -250,8 +252,16 @@ GLOBAL(ftrace_regs_call) 1: restore_mcount_regs +2: + /* + * The stack layout is nondeterministic here, depending on which path was + * taken. This confuses objtool and ORC, rightfully so. For now, + * pretend the stack always looks like the non-direct case. + */ + UNWIND_HINT_RESTORE + /* Restore flags */ -2: popfq + popfq /* * As this jmp to ftrace_epilogue can be a short jump From da537f0aef1372c5204356a7df06be8769467b7b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Oct 2019 14:38:07 -0400 Subject: [PATCH 15/51] ftrace: Add information on number of page groups allocated When looking for ways to shrink the size of the dyn_ftrace structure, it is useful to know how many pages are allocated for it, and in how many groups, in order to work out the best ways to save on memory. This adds one info print on how many groups of pages were used to allocate the ftrace dyn_ftrace structures, and also shows the number of pages and groups in dyn_ftrace_total_info (which is used for debugging).
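For example, reading the file after this change reports all three values on one line (the numbers here are illustrative, not captured from a real boot):

  # cat /sys/kernel/debug/tracing/dyn_ftrace_total_info
  46156 pages:181 groups: 8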
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 ++++++++++++++ kernel/trace/trace.c | 21 +++++++++++++++------ kernel/trace/trace.h | 2 ++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9456346ec66..d2d488c43a6a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2991,6 +2991,8 @@ static void ftrace_shutdown_sysctl(void) static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; +unsigned long ftrace_number_of_pages; +unsigned long ftrace_number_of_groups; static inline int ops_traces_mod(struct ftrace_ops *ops) { @@ -3115,6 +3117,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) goto again; } + ftrace_number_of_pages += 1 << order; + ftrace_number_of_groups++; + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; pg->size = cnt; @@ -3170,6 +3175,8 @@ ftrace_allocate_pages(unsigned long num_to_init) start_pg = pg->next; kfree(pg); pg = start_pg; + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); return NULL; @@ -6173,6 +6180,8 @@ void ftrace_release_mod(struct module *mod) free_pages((unsigned long)pg->records, order); tmp_page = pg->next; kfree(pg); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } } @@ -6514,6 +6523,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) @@ -6569,6 +6580,9 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + pr_info("ftrace: allocated %ld pages with %ld groups\n", + ftrace_number_of_pages, ftrace_number_of_groups); + set_ftrace_early_filters(); return; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a0ee9178365..5ea8c7c0f2d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7583,14 +7583,23 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned long *p = filp->private_data; - char buf[64]; /* Not too big for a shallow stack */ + ssize_t ret; + char *buf; int r; - r = scnprintf(buf, 63, "%ld", *p); - buf[r++] = '\n'; + /* 256 should be plenty to hold the amount needed */ + buf = kmalloc(256, GFP_KERNEL); + if (!buf) + return -ENOMEM; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", + ftrace_update_tot_cnt, + ftrace_number_of_pages, + ftrace_number_of_groups); + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return ret; } static const struct file_operations tracing_dyn_info_fops = { @@ -8782,7 +8791,7 @@ static __init int tracer_init_tracefs(void) #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, &tracing_dyn_info_fops); + NULL, &tracing_dyn_info_fops); #endif create_trace_instances(d_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d685c61085c0..8b590f10bc72 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -804,6 +804,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +extern unsigned long ftrace_number_of_pages; +extern unsigned long 
ftrace_number_of_groups; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } From 91edde2e6ae1dd5e33812f076f3fe4cb7ccbfdd0 Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:21 +0200 Subject: [PATCH 16/51] ftrace: Implement fs notification for tracing_max_latency This patch makes the tracing_max_latency file, e.g. /sys/kernel/debug/tracing/tracing_max_latency, send notifications through the fsnotify framework whenever a new maximum latency is available. One particularly interesting use of this facility is when enabling threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh, together with the preempt/irqsoff tracers. This makes it possible to implement a user space program that can, with equal probability, obtain traces of latencies that occur immediately after each other, in spite of the fact that the preempt/irqsoff tracers operate in overwrite mode. This facility works with the hwlat, preempt/irqsoff, and wakeup tracers. The tracers may call latency_fsnotify() from places such as __schedule() or do_idle(); this makes it impossible to call queue_work() directly without risking a deadlock. The same would happen with a softirq, kernel thread or tasklet. For this reason we use the irq_work mechanism to call queue_work(). This patch creates a new workqueue. The reason for doing this is that I wanted to use the WQ_UNBOUND and WQ_HIGHPRI flags. My thinking was that WQ_UNBOUND might help with the latency in some important cases. If we use queue_work(system_highpri_wq, &tr->fsnotify_work), then the work will (almost) always execute on the same CPU, but if we are unlucky, that CPU could be too busy while another CPU in the system would be able to process the work soon enough. queue_work_on() could be used to queue the work on another CPU, but it seems difficult to select the right CPU.
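To give an idea of the intended usage, a user space program can now block on the file with inotify and react as soon as a new maximum latency is recorded. A minimal sketch (error handling omitted):

  #include <sys/inotify.h>
  #include <unistd.h>

  int fd = inotify_init();
  char buf[4096];

  inotify_add_watch(fd, "/sys/kernel/debug/tracing/tracing_max_latency",
                    IN_MODIFY);
  /* Returns once the kernel reports a new max latency on the file */
  read(fd, buf, sizeof(buf));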
Link: http://lkml.kernel.org/r/20191008220824.7911-2-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) [ Added max() to have one compare for max latency ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 75 +++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 18 +++++++++ kernel/trace/trace_hwlat.c | 11 ++++-- 3 files changed, 98 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5ea8c7c0f2d7..f093a433cb42 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -45,6 +45,9 @@ #include #include #include +#include +#include +#include #include "trace.h" #include "trace_output.h" @@ -1497,6 +1500,74 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; +static const struct file_operations tracing_max_lat_fops; + +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify(tr->d_max_latency->d_inode, FS_MODIFY, + tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +static void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); + tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + d_tracer, &tr->max_latency, + &tracing_max_lat_fops); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. 
+ */ + irq_work_queue(&tr->fsnotify_irqwork); +} + +/* + * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + * defined(CONFIG_FSNOTIFY) + */ +#else + +#define trace_create_maxlat_file(tr, d_tracer) \ + trace_create_file("tracing_max_latency", 0644, d_tracer, \ + &tr->max_latency, &tracing_max_lat_fops) + +#endif #ifdef CONFIG_TRACER_MAX_TRACE /* @@ -1536,6 +1607,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* record this tasks comm */ tracing_record_cmdline(tsk); + latency_fsnotify(tr); } /** @@ -8594,8 +8666,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tr->max_latency, &tracing_max_lat_fops); + trace_create_maxlat_file(tr, d_tracer); #endif if (ftrace_create_function_files(tr, d_tracer)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b590f10bc72..718eb998c13e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -16,6 +16,8 @@ #include #include #include +#include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -264,6 +266,11 @@ struct trace_array { #endif #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; +#ifdef CONFIG_FSNOTIFY + struct dentry *d_max_latency; + struct work_struct fsnotify_work; + struct irq_work fsnotify_irqwork; +#endif #endif struct trace_pid_list __rcu *filtered_pids; /* @@ -786,6 +793,17 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +void latency_fsnotify(struct trace_array *tr); + +#else + +static void latency_fsnotify(struct trace_array *tr) { } + +#endif + #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 862f4b0139fc..63526670605a 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -237,6 +237,7 @@ static int get_sample(void) /* If we exceed the threshold value, we have found a hardware latency */ if (sample > thresh || outer_sample > thresh) { struct hwlat_sample s; + u64 latency; ret = 1; @@ -253,11 +254,13 @@ static int get_sample(void) s.nmi_count = nmi_count; trace_hwlat_sample(&s); + latency = max(sample, outer_sample); + /* Keep a running maximum ever recorded hardware latency */ - if (sample > tr->max_latency) - tr->max_latency = sample; - if (outer_sample > tr->max_latency) - tr->max_latency = outer_sample; + if (latency > tr->max_latency) { + tr->max_latency = latency; + latency_fsnotify(tr); + } } out: From 793937236d1ee032d2ee5ccc27bdd280a04e766e Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:22 +0200 Subject: [PATCH 17/51] preemptirq_delay_test: Add the burst feature and a sysfs trigger This burst feature enables the user to generate a burst of preempt/irqsoff latencies. This makes it possible to test whether we are able to detect latencies that systematically occur very close to each other. The maximum burst size is 10. We also create 10 identical test functions, so that we get 10 different backtraces; this is useful when we want to test whether we can detect all the latencies in a burst. 
Otherwise, there would be no easy way of differentiating between which latency in a burst was captured by the tracer. In addition, there is a sysfs trigger, so that it's not necessary to reload the module to repeat the test. The trigger will appear as /sys/kernel/preemptirq_delay_test/trigger in sysfs. Link: http://lkml.kernel.org/r/20191008220824.7911-3-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 6 +- kernel/trace/preemptirq_delay_test.c | 144 +++++++++++++++++++++++---- 2 files changed, 128 insertions(+), 22 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 624a05e99b0b..d25314bc7a1c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -760,9 +760,9 @@ config PREEMPTIRQ_DELAY_TEST configurable delay. The module busy waits for the duration of the critical section. - For example, the following invocation forces a one-time irq-disabled - critical section for 500us: - modprobe preemptirq_delay_test test_mode=irq delay=500000 + For example, the following invocation generates a burst of three + irq-disabled critical sections for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3 If unsure, say N diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index d8765c952fab..31c0fad4cb9e 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -10,18 +10,25 @@ #include #include #include +#include #include #include #include #include +#include static ulong delay = 100; -static char test_mode[10] = "irq"; +static char test_mode[12] = "irq"; +static uint burst_size = 1; -module_param_named(delay, delay, ulong, S_IRUGO); -module_param_string(test_mode, test_mode, 10, S_IRUGO); -MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)"); -MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)"); +module_param_named(delay, delay, ulong, 0444); +module_param_string(test_mode, test_mode, 12, 0444); +module_param_named(burst_size, burst_size, uint, 0444); +MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)"); +MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)"); + +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) static void busy_wait(ulong time) { @@ -34,37 +41,136 @@ static void busy_wait(ulong time) } while ((end - start) < (time * 1000)); } -static int preemptirq_delay_run(void *data) +static __always_inline void irqoff_test(void) { unsigned long flags; + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); +} - if (!strcmp(test_mode, "irq")) { - local_irq_save(flags); - busy_wait(delay); - local_irq_restore(flags); - } else if (!strcmp(test_mode, "preempt")) { - preempt_disable(); - busy_wait(delay); - preempt_enable(); +static __always_inline void preemptoff_test(void) +{ + preempt_disable(); + busy_wait(delay); + preempt_enable(); +} + +static void execute_preemptirqtest(int idx) +{ + if (!strcmp(test_mode, "irq")) + irqoff_test(); + else if (!strcmp(test_mode, "preempt")) + preemptoff_test(); + else if (!strcmp(test_mode, "alternate")) { + if (idx % 2 == 0) + irqoff_test(); + else + preemptoff_test(); } +} +#define DECLARE_TESTFN(POSTFIX) \ + static void preemptirqtest_##POSTFIX(int idx) \ + { \ + execute_preemptirqtest(idx); \ + } \ + +/* + * We create 10 different functions, so that we can get 10 different + * backtraces. + */ +DECLARE_TESTFN(0) +DECLARE_TESTFN(1) +DECLARE_TESTFN(2) +DECLARE_TESTFN(3) +DECLARE_TESTFN(4) +DECLARE_TESTFN(5) +DECLARE_TESTFN(6) +DECLARE_TESTFN(7) +DECLARE_TESTFN(8) +DECLARE_TESTFN(9) + +static void (*testfuncs[])(int) = { + preemptirqtest_0, + preemptirqtest_1, + preemptirqtest_2, + preemptirqtest_3, + preemptirqtest_4, + preemptirqtest_5, + preemptirqtest_6, + preemptirqtest_7, + preemptirqtest_8, + preemptirqtest_9, +}; + +#define NR_TEST_FUNCS ARRAY_SIZE(testfuncs) + +static int preemptirq_delay_run(void *data) +{ + int i; + int s = MIN(burst_size, NR_TEST_FUNCS); + + for (i = 0; i < s; i++) + (testfuncs[i])(i); return 0; } -static int __init preemptirq_delay_init(void) +static struct task_struct *preemptirq_start_test(void) { char task_name[50]; - struct task_struct *test_task; snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + return kthread_run(preemptirq_delay_run, NULL, task_name); +} - test_task = kthread_run(preemptirq_delay_run, NULL, task_name); - return PTR_ERR_OR_ZERO(test_task); + +static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + preemptirq_start_test(); + return count; +} + +static struct kobj_attribute trigger_attribute = + __ATTR(trigger, 0200, NULL, trigger_store); + +static struct attribute *attrs[] = { + &trigger_attribute.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *preemptirq_delay_kobj; + +static int __init preemptirq_delay_init(void) +{ + struct task_struct *test_task; + int retval; + + test_task = preemptirq_start_test(); + retval = PTR_ERR_OR_ZERO(test_task); + if (retval != 0) + return retval; + + preemptirq_delay_kobj = kobject_create_and_add("preemptirq_delay_test", + kernel_kobj); + if (!preemptirq_delay_kobj) + return -ENOMEM; + + retval = sysfs_create_group(preemptirq_delay_kobj, &attr_group); + if (retval) + kobject_put(preemptirq_delay_kobj); + + return retval; } static void __exit preemptirq_delay_exit(void) { - return; + kobject_put(preemptirq_delay_kobj); } module_init(preemptirq_delay_init) From 9c34fc4b7e903117cc27712b9e6c8690debb7e95 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:20 +0200 Subject: [PATCH 18/51] tracing: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by 
CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Add additional header output for PREEMPT_RT. Link: http://lkml.kernel.org/r/20191015191821.11479-34-bigeasy@linutronix.de Cc: Ingo Molnar Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 2 +- kernel/trace/trace.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 740bd0224d35..2a05e770618a 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -146,7 +146,7 @@ FTRACE_OPS_FL_RECURSION_SAFE itself or any nested functions that those functions call. If this flag is set, it is possible that the callback will also - be called with preemption enabled (when CONFIG_PREEMPT is set), + be called with preemption enabled (when CONFIG_PREEMPTION is set), but this is not guaranteed. FTRACE_OPS_FL_IPMODIFY diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f093a433cb42..db7d06a26861 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3726,6 +3726,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "desktop", #elif defined(CONFIG_PREEMPT) "preempt", +#elif defined(CONFIG_PREEMPT_RT) + "preempt_rt", #else "unknown", #endif From 6dff4d7dd3e0158688683a17dd792861aa9d61e2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 15 Oct 2019 13:10:12 +0100 Subject: [PATCH 19/51] tracing: Make internal ftrace events static The event_class_ftrace_##call and event_##call do not seem to be used outside of trace_export.c so make them both static to avoid a number of sparse warnings: kernel/trace/trace_entries.h:59:1: warning: symbol 'event_class_ftrace_function' was not declared. Should it be static? kernel/trace/trace_entries.h:59:1: warning: symbol '__event_function' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol 'event_class_ftrace_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol '__event_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol 'event_class_ftrace_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol '__event_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol 'event_class_ftrace_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol '__event_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol 'event_class_ftrace_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol '__event_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol 'event_class_ftrace_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol '__event_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol 'event_class_ftrace_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol '__event_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol 'event_class_ftrace_bprint' was not declared. Should it be static? 
kernel/trace/trace_entries.h:214:1: warning: symbol '__event_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol 'event_class_ftrace_print' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol '__event_print' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol 'event_class_ftrace_raw_data' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol '__event_raw_data' was not declared. Should it be static? kernel/trace/trace_entries.h:262:1: warning: symbol 'event_class_ftrace_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:262:1: warning: symbol '__event_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol 'event_class_ftrace_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol '__event_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol 'event_class_ftrace_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol '__event_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol 'event_class_ftrace_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol '__event_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol 'event_class_ftrace_hwlat' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol '__event_hwlat' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191015121012.18824-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_export.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 45630a76ed3a..2e6d2e9741cc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -171,7 +171,7 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct trace_event_class __refdata event_class_ftrace_##call = { \ +static struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ @@ -187,7 +187,7 @@ struct trace_event_call __used event_##call = { \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ -struct trace_event_call __used \ +static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY From 2d6425af61166e026e7476db64f70f1266127b1d Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:23 -0700 Subject: [PATCH 20/51] tracing: Declare newly exported APIs in include/linux/trace.h Declare the newly introduced and exported APIs in the header file - include/linux/trace.h. Moving previous declarations from kernel/trace/trace.h to include/linux/trace.h. 
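As a minimal sketch of what this enables (the instance name is made up and error handling is trimmed), a module can now reach these APIs through the public header alone:

  #include <linux/trace.h>

  struct trace_array *tr;

  tr = trace_array_create("my_instance");
  if (!IS_ERR(tr)) {
          trace_array_printk(tr, _THIS_IP_, "hello\n");
          trace_array_destroy(tr);
  }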
Link: http://lkml.kernel.org/r/1565805327-579-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace.h | 7 +++++++ kernel/trace/trace.h | 4 +--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/trace.h b/include/linux/trace.h index b95ffb2188ab..24fcf07812ae 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -24,6 +24,13 @@ struct trace_export { int register_ftrace_export(struct trace_export *export); int unregister_ftrace_export(struct trace_export *export); +struct trace_array; + +void trace_printk_init_buffers(void); +int trace_array_printk(struct trace_array *tr, unsigned long ip, + const char *fmt, ...); +struct trace_array *trace_array_create(const char *name); +int trace_array_destroy(struct trace_array *tr); #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 718eb998c13e..90cba68c8b50 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -873,8 +874,6 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...); int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -1890,7 +1889,6 @@ extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); -void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); From e585e6469d6f476b82aa148dc44aaf7ae269a4e2 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:24 -0700 Subject: [PATCH 21/51] tracing: Verify if trace array exists before destroying it. A trace array can be destroyed from userspace or kernel. Verify if the trace array exists before proceeding to destroy/remove it. Link: http://lkml.kernel.org/r/1565805327-579-3-git-send-email-divya.indi@oracle.com Reviewed-by: Aruna Ramakrishna Signed-off-by: Divya Indi [ Removed unneeded braces ] Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 6 +++++- kernel/trace/trace.c | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..6e2fd40a6ed9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_ro(mod, false); module_enable_nx(mod); - module_enable_x(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. 
*/ @@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod) if (err) return err; + /* Make module executable after ftrace is enabled */ + mutex_lock(&module_mutex); + module_enable_x(mod); + mutex_unlock(&module_mutex); + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index db7d06a26861..fa4f742fc449 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8556,17 +8556,26 @@ static int __remove_instance(struct trace_array *tr) return 0; } -int trace_array_destroy(struct trace_array *tr) +int trace_array_destroy(struct trace_array *this_tr) { + struct trace_array *tr; int ret; - if (!tr) + if (!this_tr) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = __remove_instance(tr); + ret = -ENODEV; + + /* Making sure trace array exists before destroying it. */ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + ret = __remove_instance(tr); + break; + } + } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); From 953ae45a0c25e09428d4a03d7654f97ab8a36647 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:25 -0700 Subject: [PATCH 22/51] tracing: Adding NULL checks for trace_array descriptor pointer As part of commit f45d1225adb0 ("tracing: Kernel access to Ftrace instances") we exported certain functions. Here, we are adding some additional NULL checks to ensure safe usage by users of these APIs. Link: http://lkml.kernel.org/r/1565805327-579-4-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa4f742fc449..79fe4d6ecbd8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3297,6 +3297,9 @@ int trace_array_printk(struct trace_array *tr, if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; + if (!tr) + return -ENOENT; + va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fba87d10f0c1..2a3ac2365445 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -793,6 +793,8 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) char *event = NULL, *sub = NULL, *match; int ret; + if (!tr) + return -ENOENT; /* * The buf format can be : * *: means any event by that name. From b83b43ffc6e4b514ca034a0fbdee01322e2f7022 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 15 Oct 2019 09:00:55 -0400 Subject: [PATCH 23/51] fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub The C compiler is allowing more checks to make sure that function pointers are assigned to the correct prototype function. Unfortunately, the function graph tracer uses a special name with its assigned ftrace_graph_return function pointer that maps to a stub function used by the function tracer (ftrace_stub). The ftrace_graph_return variable is compared to the ftrace_stub in some archs to know if the function graph tracer is enabled or not. This means we can not just simply create a new function stub that compares it without modifying all the archs. 
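(The arch-side test in question looks roughly like the following; this is a sketch, not a quote from any particular arch:

  if (ftrace_graph_return != (trace_func_graph_ret_t)ftrace_stub)
          /* the function graph tracer is active */

so whatever replaces the stub must still compare equal to ftrace_stub.)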
Instead, have the linker script create a function_graph_stub that maps to ftrace_stub, and this way we can define the prototype for it to match the prototype of ftrace_graph_return, and make the compiler checks all happy! Link: http://lkml.kernel.org/r/20191015090055.789a0aed@gandalf.local.home Cc: linux-sh@vger.kernel.org Cc: Yoshinori Sato Cc: Rich Felker Reported-by: Sami Tolvanen Signed-off-by: Steven Rostedt (VMware) --- arch/sh/boot/compressed/misc.c | 5 +++++ include/asm-generic/vmlinux.lds.h | 17 ++++++++++++++--- kernel/trace/fgraph.c | 11 ++++++++--- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index c15cac9251b9..e69ec12cbbe6 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -111,6 +111,11 @@ void __stack_chk_fail(void) error("stack-protector: Kernel stack is corrupted\n"); } +/* Needed because vmlinux.lds.h references this */ +void ftrace_stub(void) +{ +} + #ifdef CONFIG_SUPERH64 #define stackalign 8 #else diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index dae64600ccbf..0f358be551cd 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -111,18 +111,29 @@ #ifdef CONFIG_FTRACE_MCOUNT_RECORD #ifdef CC_USING_PATCHABLE_FUNCTION_ENTRY +/* + * Need to also make ftrace_graph_stub point to ftrace_stub + * so that the same stub location may have different protocols + * and not mess up with C verifiers. + */ #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__patchable_function_entries)) \ - __stop_mcount_loc = .; + __stop_mcount_loc = .; \ + ftrace_graph_stub = ftrace_stub; #else #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ - __stop_mcount_loc = .; + __stop_mcount_loc = .; \ + ftrace_graph_stub = ftrace_stub; #endif #else -#define MCOUNT_REC() +# ifdef CONFIG_FUNCTION_TRACER +# define MCOUNT_REC() ftrace_graph_stub = ftrace_stub; +# else +# define MCOUNT_REC() +# endif #endif #ifdef CONFIG_TRACE_BRANCH_PROFILING diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 7950a0356042..fa3ce10d0405 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -332,9 +332,14 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) return 0; } +/* + * Simply points to ftrace_stub, but with the proper protocol. + * Defined by the linker script in linux/vmlinux.lds.h + */ +extern void ftrace_graph_stub(struct ftrace_graph_ret *); + /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -614,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_return = ftrace_graph_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); From b43e78f65b1d35fd3e13c7b23f9b64ea83c9ad3a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 13 Nov 2019 11:48:39 -0500 Subject: [PATCH 24/51] tracing/selftests: Turn off timeout setting As the ftrace selftests can run for a long period of time, disable the timeout that the general selftests have. 
If a selftest hangs, then it probably means the machine will hang too. Link: https://lore.kernel.org/r/alpine.LSU.2.21.1911131604170.18679@pobox.suse.cz Suggested-by: Miroslav Benes Tested-by: Miroslav Benes Reviewed-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/settings | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/ftrace/settings diff --git a/tools/testing/selftests/ftrace/settings b/tools/testing/selftests/ftrace/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/ftrace/settings @@ -0,0 +1 @@ +timeout=0 From 52ae533b8a18e7ca868e7ac5953ad7258210f320 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2019 16:56:54 +0300 Subject: [PATCH 25/51] lib/sort: Move swap, cmp and cmp_r function types for wider use The function types for swap, cmp and cmp_r functions are already being in use by modules. Move them to types.h that everybody in kernel will be able to use generic types instead of custom ones. This adds more sense to the comment in bsearch() later on. Link: http://lkml.kernel.org/r/20191007135656.37734-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- include/linux/sort.h | 8 ++++---- include/linux/types.h | 5 +++++ lib/sort.c | 15 +++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/sort.h b/include/linux/sort.h index 61b96d0ebc44..b5898725fe9d 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -5,12 +5,12 @@ #include void sort_r(void *base, size_t num, size_t size, - int (*cmp)(const void *, const void *, const void *), - void (*swap)(void *, void *, int), + cmp_r_func_t cmp_func, + swap_func_t swap_func, const void *priv); void sort(void *base, size_t num, size_t size, - int (*cmp)(const void *, const void *), - void (*swap)(void *, void *, int)); + cmp_func_t cmp_func, + swap_func_t swap_func); #endif diff --git a/include/linux/types.h b/include/linux/types.h index 05030f608be3..85c0e7b18153 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -225,5 +225,10 @@ struct callback_head { typedef void (*rcu_callback_t)(struct rcu_head *head); typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func); +typedef void (*swap_func_t)(void *a, void *b, int size); + +typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); +typedef int (*cmp_func_t)(const void *a, const void *b); + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ diff --git a/lib/sort.c b/lib/sort.c index d54cf97e9548..3ad454411997 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -117,8 +117,6 @@ static void swap_bytes(void *a, void *b, size_t n) } while (n); } -typedef void (*swap_func_t)(void *a, void *b, int size); - /* * The values are arbitrary as long as they can't be confused with * a pointer, but small integers make for the smallest compare @@ -144,12 +142,9 @@ static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func) swap_func(a, b, (int)size); } -typedef int (*cmp_func_t)(const void *, const void *); -typedef int (*cmp_r_func_t)(const void *, const void *, const void *); #define _CMP_WRAPPER ((cmp_r_func_t)0L) -static int do_cmp(const void *a, const void *b, - cmp_r_func_t cmp, const void *priv) +static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) { if (cmp == _CMP_WRAPPER) return ((cmp_func_t)(priv))(a, b); @@ -202,8 +197,8 @@ static size_t parent(size_t i, 
unsigned int lsbit, size_t size) * it less suitable for kernel use. */ void sort_r(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, const void *), - void (*swap_func)(void *, void *, int size), + cmp_r_func_t cmp_func, + swap_func_t swap_func, const void *priv) { /* pre-scale counters for performance */ @@ -269,8 +264,8 @@ void sort_r(void *base, size_t num, size_t size, EXPORT_SYMBOL(sort_r); void sort(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *), - void (*swap_func)(void *, void *, int size)) + cmp_func_t cmp_func, + swap_func_t swap_func) { return sort_r(base, num, size, _CMP_WRAPPER, swap_func, cmp_func); } From e8877ec5dbba6f39d25ca3a81716c23b1760f2ee Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2019 16:56:55 +0300 Subject: [PATCH 26/51] lib/bsearch: Use generic type for comparator function Comparator function type, cmp_func_t, is defined in the types.h, use it in bsearch() and, thus, add more sense to the corresponding comment in the code. Link: http://lkml.kernel.org/r/20191007135656.37734-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- include/linux/bsearch.h | 2 +- lib/bsearch.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bsearch.h b/include/linux/bsearch.h index 62b1eb348858..8ed53d7524ea 100644 --- a/include/linux/bsearch.h +++ b/include/linux/bsearch.h @@ -5,6 +5,6 @@ #include void *bsearch(const void *key, const void *base, size_t num, size_t size, - int (*cmp)(const void *key, const void *elt)); + cmp_func_t cmp); #endif /* _LINUX_BSEARCH_H */ diff --git a/lib/bsearch.c b/lib/bsearch.c index 8baa83968162..8b3aae5ae77a 100644 --- a/lib/bsearch.c +++ b/lib/bsearch.c @@ -29,7 +29,7 @@ * the same comparison function for both sort() and bsearch(). */ void *bsearch(const void *key, const void *base, size_t num, size_t size, - int (*cmp)(const void *key, const void *elt)) + cmp_func_t cmp) { const char *pivot; int result; From 80042c8f06bf5a7b87a63deaa3deb56f2cd52645 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2019 16:56:56 +0300 Subject: [PATCH 27/51] tracing: Use generic type for comparator function Comparator function type, cmp_func_t, is defined in the types.h, use it in the code. 
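For example, a stat tracer now declares its comparator with the generic type directly. A sketch with a made-up stat structure (the other tracer_stat callbacks are omitted):

  struct my_stat { unsigned long hits; };

  static int my_stat_cmp(const void *p1, const void *p2)
  {
          const struct my_stat *a = p1, *b = p2;

          return a->hits > b->hits ? 1 : a->hits < b->hits ? -1 : 0;
  }

  static struct tracer_stat my_trace_stat = {
          .name     = "my_stats",
          .stat_cmp = my_stat_cmp,
  };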
Link: http://lkml.kernel.org/r/20191007135656.37734-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++------ kernel/trace/trace_branch.c | 8 ++++---- kernel/trace/trace_stat.c | 6 ++---- kernel/trace/trace_stat.h | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d2d488c43a6a..82ef8d60a42b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,10 +465,10 @@ static void *function_stat_start(struct tracer_stat *trace) #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* function graph compares on total time */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->time < b->time) return -1; @@ -479,10 +479,10 @@ static int function_stat_cmp(void *p1, void *p2) } #else /* not function graph compares against hits */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->counter < b->counter) return -1; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 3ea65cdff30d..88e158d27965 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -244,7 +244,7 @@ static int annotated_branch_stat_headers(struct seq_file *m) return 0; } -static inline long get_incorrect_percent(struct ftrace_branch_data *p) +static inline long get_incorrect_percent(const struct ftrace_branch_data *p) { long percent; @@ -332,10 +332,10 @@ annotated_branch_stat_next(void *v, int idx) return p; } -static int annotated_branch_stat_cmp(void *p1, void *p2) +static int annotated_branch_stat_cmp(const void *p1, const void *p2) { - struct ftrace_branch_data *a = p1; - struct ftrace_branch_data *b = p2; + const struct ftrace_branch_data *a = p1; + const struct ftrace_branch_data *b = p2; long percent_a, percent_b; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 9ab0a1a7ad5e..874f1274cf99 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -72,9 +72,7 @@ static void destroy_session(struct stat_session *session) kfree(session); } -typedef int (*cmp_stat_t)(void *, void *); - -static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp) { struct rb_node **new = &(root->rb_node), *parent = NULL; struct stat_node *data; @@ -112,7 +110,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) * This one will force an insertion as right-most node * in the rbtree. 
*/ -static int dummy_cmp(void *p1, void *p2) +static int dummy_cmp(const void *p1, const void *p2) { return -1; } diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 8786d17caf49..31d7dc5bf1db 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -16,7 +16,7 @@ struct tracer_stat { void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ - int (*stat_cmp)(void *p1, void *p2); + cmp_func_t stat_cmp; /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); /* Release an entry */ From 0c3c86bdc691c794a6154f8515b7fa82c82dfc4d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:51:17 -0700 Subject: [PATCH 28/51] tracing/hwlat: Fix a few trivial nits Update the source file name in the comments, and fix a grammatical error. Link: http://lkml.kernel.org/r/157073346821.17189.8946944856026592247.stgit@srivatsa-ubuntu Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 63526670605a..6638d63f0921 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * trace_hwlatdetect.c - A simple Hardware Latency detector. + * trace_hwlat.c - A simple Hardware Latency detector. * * Use this tracer to detect large system latencies induced by the behavior of * certain underlying system hardware or firmware, independent of Linux itself. @@ -279,7 +279,7 @@ static void move_to_next_cpu(void) return; /* * If for some reason the user modifies the CPU affinity - * of this thread, than stop migrating for the duration + * of this thread, then stop migrating for the duration * of the current test. */ if (!cpumask_equal(current_mask, current->cpus_ptr)) From 6ee40511cb838f9ced002dff7131bca87e3ccbdd Mon Sep 17 00:00:00 2001 From: Yuming Han Date: Thu, 24 Oct 2019 11:34:30 +0800 Subject: [PATCH 29/51] tracing: use kvcalloc for tgid_map array allocation Allocation of memory for the tgid_map array can fail, because it requires an order-6 page. The details: c3 sh: page allocation failure: order:6, mode:0x140c0c0(GFP_KERNEL), nodemask=(null) c3 sh cpuset=/ mems_allowed=0 c3 CPU: 3 PID: 5632 Comm: sh Tainted: G W O 4.14.133+ #10 c3 Hardware name: Generic DT based system c3 Backtrace: c3 [] (dump_backtrace) from [](show_stack+0x18/0x1c) c3 [] (show_stack) from [](dump_stack+0x84/0xa4) c3 [] (dump_stack) from [](warn_alloc+0xc4/0x19c) c3 [] (warn_alloc) from [](__alloc_pages_nodemask+0xd18/0xf28) c3 [] (__alloc_pages_nodemask) from [](kmalloc_order+0x20/0x38) c3 [] (kmalloc_order) from [](kmalloc_order_trace+0x24/0x108) c3 [] (kmalloc_order_trace) from [](set_tracer_flag+0xb0/0x158) c3 [] (set_tracer_flag) from [](trace_options_core_write+0x7c/0xcc) c3 [] (trace_options_core_write) from [](__vfs_write+0x40/0x14c) c3 [] (__vfs_write) from [](vfs_write+0xc4/0x198) c3 [] (vfs_write) from [](SyS_write+0x6c/0xd0) c3 [] (SyS_write) from [](ret_fast_syscall+0x0/0x54) Switch to kvcalloc to avoid unexpected allocation failures.
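One general point about this switch: kvcalloc() may fall back to vmalloc, so such memory must be released with kvfree() rather than kfree(). The tgid_map array itself is never freed, so no call site changes are needed here, but the usual pairing looks like:

  p = kvcalloc(n, sizeof(*p), GFP_KERNEL);
  ...
  kvfree(p);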
Link: http://lkml.kernel.org/r/1571888070-24425-1-git-send-email-chunyan.zhang@unisoc.com Signed-off-by: Yuming Han Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 79fe4d6ecbd8..42659ce6ac0c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4686,7 +4686,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) - tgid_map = kcalloc(PID_MAX_DEFAULT + 1, + tgid_map = kvcalloc(PID_MAX_DEFAULT + 1, sizeof(*tgid_map), GFP_KERNEL); if (!tgid_map) { From c7411a1a126f649be71526a36d4afac9e5aefa13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Oct 2019 17:31:44 +0900 Subject: [PATCH 30/51] tracing/kprobe: Check whether the non-suffixed symbol is notrace Check whether the non-suffixed symbol is notrace, since suffixed symbols are generated by the compilers for optimization. Based on these suffixed symbols, notrace check might not work because some of them are just a partial code of the original function. (e.g. cold-cache (unlikely) code is separated from original function as FUNCTION.cold.XX) For example, without this fix, # echo p device_add.cold.67 > /sys/kernel/debug/tracing/kprobe_events sh: write error: Invalid argument # cat /sys/kernel/debug/tracing/error_log [ 135.491035] trace_kprobe: error: Failed to register probe event Command: p device_add.cold.67 ^ # dmesg | tail -n 1 [ 135.488599] trace_kprobe: Could not probe notrace function device_add.cold.67 With this, # echo p device_add.cold.66 > /sys/kernel/debug/tracing/kprobe_events # cat /sys/kernel/debug/kprobes/list ffffffff81599de9 k device_add.cold.66+0x0 [DISABLED] Actually, kprobe blacklist already did similar thing, see within_kprobe_blacklist(). 
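As a side note, the suffixed variants present on a given kernel can be listed from kallsyms; for example (address and suffix numbers differ per build, so the output shape here is illustrative):

  # grep 'device_add.cold' /proc/kallsyms
  ffffffff81599de9 t device_add.cold.66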
Link: http://lkml.kernel.org/r/157233790394.6706.18243942030937189679.stgit@devnote2 Fixes: 45408c4f9250 ("tracing: kprobes: Prohibit probing on notrace function") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1552a95c743b..7f890262c8a3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -435,11 +435,10 @@ static int disable_trace_kprobe(struct trace_event_call *call, #if defined(CONFIG_KPROBES_ON_FTRACE) && \ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) -static bool within_notrace_func(struct trace_kprobe *tk) +static bool __within_notrace_func(unsigned long addr) { - unsigned long offset, size, addr; + unsigned long offset, size; - addr = trace_kprobe_address(tk); if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) return false; @@ -452,6 +451,28 @@ static bool within_notrace_func(struct trace_kprobe *tk) */ return !ftrace_location_range(addr, addr + size - 1); } + +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long addr = addr = trace_kprobe_address(tk); + char symname[KSYM_NAME_LEN], *p; + + if (!__within_notrace_func(addr)) + return false; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return true; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_notrace_func(addr); + } + + return true; +} #else #define within_notrace_func(tk) (false) #endif From 353cade3149c27b53260932ee3ff1ebde405976d Mon Sep 17 00:00:00 2001 From: Piotr Maziarz Date: Thu, 7 Nov 2019 13:45:37 +0100 Subject: [PATCH 31/51] seq_buf: Add printing formatted hex dumps Provided function is an analogue of print_hex_dump(). Implementing this function in seq_buf allows using for multiple purposes (e.g. for tracing) and therefore prevents from code duplication in every layer that uses seq_buf. print_hex_dump() is an essential part of logging data to dmesg. Adding similar capability for other purposes is beneficial to all users. Example usage: seq_buf_hex_dump(seq, "", DUMP_PREFIX_OFFSET, 16, 4, buf, ARRAY_SIZE(buf), true); Example output: 00000000: 00000000 ffffff10 ffffff32 ffff3210 ........2....2.. 00000010: ffff3210 83d00437 c0700000 00000000 .2..7.....p..... 00000020: 02010004 0000000f 0000000f 00004002 .............@.. 00000030: 00000fff 00000000 ........ 
Link: http://lkml.kernel.org/r/1573130738-29390-1-git-send-email-piotrx.maziarz@linux.intel.com Signed-off-by: Piotr Maziarz Signed-off-by: Cezary Rojewski Signed-off-by: Steven Rostedt (VMware) --- include/linux/seq_buf.h | 3 ++ lib/seq_buf.c | 62 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index aa5deb041c25..fb0205d87d3c 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -125,6 +125,9 @@ extern int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len); extern int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, unsigned int len); extern int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc); +extern int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); #ifdef CONFIG_BINARY_PRINTF extern int diff --git a/lib/seq_buf.c b/lib/seq_buf.c index bd807f545a9d..4e865d42ab03 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -328,3 +328,65 @@ int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt) s->readpos += cnt; return cnt; } + +/** + * seq_buf_hex_dump - print formatted hex dump into the sequence buffer + * @s: seq_buf descriptor + * @prefix_str: string to prefix each line with; + * caller supplies trailing spaces for alignment if desired + * @prefix_type: controls whether prefix of an offset, address, or none + * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) + * @rowsize: number of bytes to print per line; must be 16 or 32 + * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) + * @buf: data blob to dump + * @len: number of bytes in the @buf + * @ascii: include ASCII after the hex output + * + * Function is an analogue of print_hex_dump() and thus has similar interface. + * + * linebuf size is maximal length for one line. + * 32 * 3 - maximum bytes per line, each printed into 2 chars + 1 for + * separating space + * 2 - spaces separating hex dump and ascii representation + * 32 - ascii representation + * 1 - terminating '\0' + * + * Returns zero on success, -1 on overflow + */ +int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, + int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + const u8 *ptr = buf; + int i, linelen, remaining = len; + unsigned char linebuf[32 * 3 + 2 + 32 + 1]; + int ret; + + if (rowsize != 16 && rowsize != 32) + rowsize = 16; + + for (i = 0; i < len; i += rowsize) { + linelen = min(remaining, rowsize); + remaining -= rowsize; + + hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, + linebuf, sizeof(linebuf), ascii); + + switch (prefix_type) { + case DUMP_PREFIX_ADDRESS: + ret = seq_buf_printf(s, "%s%p: %s\n", + prefix_str, ptr + i, linebuf); + break; + case DUMP_PREFIX_OFFSET: + ret = seq_buf_printf(s, "%s%.8x: %s\n", + prefix_str, i, linebuf); + break; + default: + ret = seq_buf_printf(s, "%s%s\n", prefix_str, linebuf); + break; + } + if (ret) + return ret; + } + return 0; +} From ef56e047b2bd4dabb801fd073dfcab5f40de5f78 Mon Sep 17 00:00:00 2001 From: Piotr Maziarz Date: Thu, 7 Nov 2019 13:45:38 +0100 Subject: [PATCH 32/51] tracing: Use seq_buf_hex_dump() to dump buffers Without this, buffers can be printed with __print_array macro that has no formatting options and can be hard to read. 
The alternative is to mimic the formatting with multiple trace event calls, one call per row, which hurts performance and gives each row a different timestamp. Link: http://lkml.kernel.org/r/1573130738-29390-2-git-send-email-piotrx.maziarz@linux.intel.com Signed-off-by: Piotr Maziarz Signed-off-by: Cezary Rojewski Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 5 +++++ include/linux/trace_seq.h | 4 ++++ include/trace/trace_events.h | 6 ++++++ kernel/trace/trace_output.c | 15 +++++++++++++++ kernel/trace/trace_seq.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 60 insertions(+) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 30a8cdcfd4a4..60a41b7069dd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -45,6 +45,11 @@ const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, size_t el_size); +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); + struct trace_iterator; struct trace_event; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 6609b39a7232..6c30508fca19 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -92,6 +92,10 @@ extern int trace_seq_path(struct trace_seq *s, const struct path *path); extern void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, int nmaskbits); +extern int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); + #else /* CONFIG_TRACING */ static inline void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
{ diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 4ecdfe2e3580..7089760d4c7a 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -340,6 +340,12 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_array_seq(p, array, count, el_size); \ }) +#undef __print_hex_dump +#define __print_hex_dump(prefix_str, prefix_type, \ + rowsize, groupsize, buf, len, ascii) \ + trace_print_hex_dump_seq(p, prefix_str, prefix_type, \ + rowsize, groupsize, buf, len, ascii) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d54ce252b05a..d9b4b7c22db4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -274,6 +274,21 @@ trace_print_array_seq(struct trace_seq *p, const void *buf, int count, } EXPORT_SYMBOL(trace_print_array_seq); +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_putc(p, '\n'); + trace_seq_hex_dump(p, prefix_str, prefix_type, + rowsize, groupsize, buf, len, ascii); + trace_seq_putc(p, 0); + return ret; +} +EXPORT_SYMBOL(trace_print_hex_dump_seq); + int trace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 6b1c562ffdaf..344e4c1aa09c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -376,3 +376,33 @@ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) return seq_buf_to_user(&s->seq, ubuf, cnt); } EXPORT_SYMBOL_GPL(trace_seq_to_user); + +int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_hex_dump(&(s->seq), prefix_str, + prefix_type, rowsize, groupsize, + buf, len, ascii); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL(trace_seq_hex_dump); From 9b4712044d059e7842aaeeafd7c7a7ee88c589db Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 12 Nov 2019 18:42:19 +0100 Subject: [PATCH 33/51] tracing: Remove stray tab in TRACE_EVAL_MAP_FILE's help text There was a stray tab in the help text of the aforementioned config option which showed like this: The "print fmt" of the trace events will show the enum/sizeof names instead of their values. This can cause problems for user space tools ... in menuconfig. Remove it and end a sentence with a fullstop. No functional changes. Link: http://lkml.kernel.org/r/20191112174219.10933-1-bp@alien8.de Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d25314bc7a1c..b872716bb2a0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -771,7 +771,7 @@ config TRACE_EVAL_MAP_FILE depends on TRACING help The "print fmt" of the trace events will show the enum/sizeof names - instead of their values. This can cause problems for user space tools + instead of their values. 
This can cause problems for user space tools that use this string to parse the raw data as user space does not know how to convert the string to its value. @@ -792,7 +792,7 @@ config TRACE_EVAL_MAP_FILE they are needed for the "eval_map" file. Enabling this option will increase the memory footprint of the running kernel. - If unsure, say N + If unsure, say N. config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" From 36b3615dc3b625c8b587f34e413a600f7ac16403 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 22:43:58 -0500 Subject: [PATCH 34/51] tracing: Add missing "inline" in stub function of latency_fsnotify() The latency_fsnotify() stub when the function is not defined, was missing the "inline". Link: https://lore.kernel.org/r/20191115140213.74c5efe7@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90cba68c8b50..2df8aed6a8f0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -801,7 +801,7 @@ void latency_fsnotify(struct trace_array *tr); #else -static void latency_fsnotify(struct trace_array *tr) { } +static inline void latency_fsnotify(struct trace_array *tr) { } #endif From 0567d6809182df53da03636fad36c507c5cf07a5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 14:39:35 -0500 Subject: [PATCH 35/51] ftrace: Add modify_ftrace_direct() Add a new function modify_ftrace_direct() that will allow a user to update an existing direct caller to a new trampoline, without missing hits due to unregistering one and then adding another. Link: https://lore.kernel.org/r/20191109022907.6zzo6orhxpt5n2sv@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++ kernel/trace/ftrace.c | 78 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 55647e185141..73eb2e93593f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -250,6 +250,7 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); +int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr); struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); #else # define ftrace_direct_func_count 0 @@ -261,6 +262,11 @@ static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { return -ENODEV; } +static inline int modify_ftrace_direct(unsigned long ip, + unsigned long old_addr, unsigned long new_addr) +{ + return -ENODEV; +} static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) { return NULL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 82ef8d60a42b..834f3556ea1e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5160,6 +5160,84 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct); + +static struct ftrace_ops stub_ops = { + .func = ftrace_stub, +}; + +/** + * modify_ftrace_direct - Modify an existing direct call to call something else + * @ip: The instruction pointer to modify + * @old_addr: The address 
that the current @ip calls directly + * @new_addr: The address that the @ip should call + * + * This modifies a ftrace direct caller at an instruction pointer without + * having to disable it first. The direct call will switch over to the + * @new_addr without missing anything. + * + * Returns: zero on success. Non zero on error, which includes: + * -ENODEV : the @ip given has no direct caller attached + * -EINVAL : the @old_addr does not match the current direct caller + */ +int modify_ftrace_direct(unsigned long ip, + unsigned long old_addr, unsigned long new_addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) + goto out_unlock; + + ip = rec->ip; + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + ret = -EINVAL; + if (entry->direct != old_addr) + goto out_unlock; + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. + */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_unlock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_unlock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. + */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + ret = 0; + + out_unlock: + mutex_unlock(&direct_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** From ae0cc3b7e7f5e37b946876e114068f92b33dfa5a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 14:41:47 -0500 Subject: [PATCH 36/51] ftrace/samples: Add a sample module that implements modify_ftrace_direct() Add a sample module that tests modify_ftrace_direct(), and this can be used by the selftests as well. 
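In outline, the lifecycle the sample exercises looks like this (a sketch only; my_func, tramp_a and tramp_b are placeholder names for a traced function and two direct trampolines):

	/* attach the first trampoline */
	ret = register_ftrace_direct((unsigned long)my_func,
				     (unsigned long)tramp_a);
	/* later: retarget the call without unregistering first,
	 * so no hits are missed while switching */
	ret = modify_ftrace_direct((unsigned long)my_func,
				   (unsigned long)tramp_a,
				   (unsigned long)tramp_b);
	/* teardown: unregister whichever trampoline is current */
	ret = unregister_ftrace_direct((unsigned long)my_func,
				       (unsigned long)tramp_b);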
Signed-off-by: Steven Rostedt (VMware) --- samples/ftrace/Makefile | 1 + samples/ftrace/ftrace-direct-modify.c | 88 +++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 samples/ftrace/ftrace-direct-modify.c diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile index d8217c4e072e..fb0c3ae18295 100644 --- a/samples/ftrace/Makefile +++ b/samples/ftrace/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c new file mode 100644 index 000000000000..e04229d21475 --- /dev/null +++ b/samples/ftrace/ftrace-direct-modify.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/ftrace.h> + +void my_direct_func1(void) +{ + trace_printk("my direct func1\n"); +} + +void my_direct_func2(void) +{ + trace_printk("my direct func2\n"); +} + +extern void my_tramp1(void *); +extern void my_tramp2(void *); + +static unsigned long my_ip = (unsigned long)schedule; + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" my_tramp1:" +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" call my_direct_func1\n" +" leave\n" +" ret\n" +" my_tramp2:" +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" call my_direct_func2\n" +" leave\n" +" ret\n" +" .popsection\n" +); + +static unsigned long my_tramp = (unsigned long)my_tramp1; +static unsigned long tramps[2] = { + (unsigned long)my_tramp1, + (unsigned long)my_tramp2, +}; + +static int simple_thread(void *arg) +{ + static int t; + int ret = 0; + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(2 * HZ); + + if (ret) + continue; + t ^= 1; + ret = modify_ftrace_direct(my_ip, my_tramp, tramps[t]); + if (!ret) + my_tramp = tramps[t]; + WARN_ON_ONCE(ret); + } + + return 0; +} + +static struct task_struct *simple_tsk; + +static int __init ftrace_direct_init(void) +{ + int ret; + + ret = register_ftrace_direct(my_ip, my_tramp); + if (!ret) + simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn"); + return ret; +} + +static void __exit ftrace_direct_exit(void) +{ + kthread_stop(simple_tsk); + unregister_ftrace_direct(my_ip, my_tramp); +} + +module_init(ftrace_direct_init); +module_exit(ftrace_direct_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()"); +MODULE_LICENSE("GPL"); From 58a74a2925a5b4125dd4f4e728490b9642534c81 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 15 Nov 2019 11:17:30 +0200 Subject: [PATCH 37/51] tracing: Increase SYNTH_FIELDS_MAX for synthetic_events Increase the maximum allowed count of synthetic event fields from 16 to 32 in order to allow for larger-than-usual events.
Link: http://lkml.kernel.org/r/20191115091730.9192-1-dedekind1@gmail.com Reviewed-by: Tom Zanussi Signed-off-by: Artem Bityutskiy Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7482a1466ebf..f49d1a36d3ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -23,7 +23,7 @@ #include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 16 +#define SYNTH_FIELDS_MAX 32 #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ From 760f8bc7c89c87bff9c01a8746e85dcc6e771ea8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Nov 2019 08:59:38 +0000 Subject: [PATCH 38/51] ftrace/selftests: Fix spelling mistake "wakeing" -> "waking" There is a spelling mistake in a trace_printk message, as well as in the selftests that search for this string. Link: http://lkml.kernel.org/r/20191115085938.38947-1-colin.king@canonical.com Link: http://lkml.kernel.org/r/20191115090356.39572-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- samples/ftrace/ftrace-direct.c | 2 +- tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc | 2 +- tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index 1483f067b000..a2e3063bd306 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -6,7 +6,7 @@ void my_direct_func(struct task_struct *p) { - trace_printk("wakeing up %s-%d\n", p->comm, p->pid); + trace_printk("waking up %s-%d\n", p->comm, p->pid); } extern void my_tramp(void *); diff --git a/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc b/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc index cbc7a30c2e0f..d75a8695bc21 100644 --- a/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc +++ b/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc @@ -11,7 +11,7 @@ fi echo "Let the module run a little" sleep 1 -grep -q "my_direct_func: wakeing up" trace +grep -q "my_direct_func: waking up" trace rmmod ftrace-direct diff --git a/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc b/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc index 017a7409b103..801ecb63e84c 100644 --- a/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc +++ b/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc @@ -16,7 +16,7 @@ fi echo "Let the module run a little" sleep 1 -grep -q "my_direct_func: wakeing up" trace +grep -q "my_direct_func: waking up" trace rmmod ftrace-direct @@ -26,7 +26,7 @@ start_direct() { echo > trace modprobe ftrace-direct sleep 1 - grep -q "my_direct_func: wakeing up" trace + grep -q "my_direct_func: waking up" trace } stop_direct() { From 1c7f9b673dc0a15753274c4e7f5ebfd4468fc69f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:13:20 -0500 Subject: [PATCH 39/51] ftrace: Fix accounting bug with direct->count in register_ftrace_direct() The direct->count wasn't being updated properly: it was only updated when the first entry was added, but it should be updated every time.
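A hypothetical sequence makes the problem concrete (my_tramp, ip1 and ip2 are placeholders):

	register_ftrace_direct(ip1, (unsigned long)my_tramp);   /* count = 1 */
	register_ftrace_direct(ip2, (unsigned long)my_tramp);   /* count must become 2 */
	unregister_ftrace_direct(ip1, (unsigned long)my_tramp); /* count back to 1 */

With the bug, the second registration left count at 1, so the unregister made the trampoline appear unused even though ip2 still called it.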
Fixes: 013bf0da04748 ("ftrace: Add ftrace_find_direct_func()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 834f3556ea1e..32e4e5ffdd97 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5093,8 +5093,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_direct_func_count--; } } else { - if (!direct->count) - direct->count++; + direct->count++; } out_unlock: mutex_unlock(&direct_mutex); From 406acdd32d3e7d5a6dcb7f67798e89068fbe0d77 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:19:04 -0500 Subject: [PATCH 40/51] ftrace: Add another check for match in register_ftrace_direct() As an instruction pointer passed into register_ftrace_direct() may just exist on the ftrace call site, but may not be the start of the call site itself, register_ftrace_direct() still needs to test whether a direct call exists on the normalized site, as only one direct call is allowed at any one time. Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 32e4e5ffdd97..9fe33ebaf914 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5030,7 +5030,12 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; /* Make sure the ip points to the exact record */ - ip = rec->ip; + if (ip != rec->ip) { + ip = rec->ip; + /* Need to check this ip for a direct. */ + if (find_rec_direct(ip)) + goto out_unlock; + } ret = -ENOMEM; if (ftrace_hash_empty(direct_functions) || From 128161f47bc3797b0d068da13e311770685d6e4f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:14:45 -0500 Subject: [PATCH 41/51] ftrace: Add helper find_direct_entry() to consolidate code Both unregister_ftrace_direct() and modify_ftrace_direct() need to normalize the ip passed in to match the rec->ip, as it is acceptable to have the ip on the ftrace call site but not at its start. There are also common validity checks on the record found by the ip; these should be done for both unregister_ftrace_direct() and modify_ftrace_direct().
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 61 +++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9fe33ebaf914..ef79c8393f53 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5112,30 +5112,40 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) { struct ftrace_func_entry *entry; - struct ftrace_direct_func *direct; struct dyn_ftrace *rec; + + rec = lookup_rec(*ip, *ip); + if (!rec) + return NULL; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) { + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + return NULL; + } + + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + + /* Passed in ip just needs to be on the call site */ + *ip = rec->ip; + + return entry; +} + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_direct_func *direct; + struct ftrace_func_entry *entry; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = __ftrace_lookup_ip(direct_functions, ip); - if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; - - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) { - WARN_ON(rec->flags & FTRACE_FL_DIRECT); - goto out_unlock; - } - - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; if (direct_functions->count == 1) unregister_ftrace_function(&direct_ops); @@ -5187,24 +5197,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; - struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = __ftrace_lookup_ip(direct_functions, ip); - if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) - goto out_unlock; - - ip = rec->ip; - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; ret = -EINVAL; if (entry->direct != old_addr) goto out_unlock; From ea806eb3eab35528b578a061b2c4b28f0f92c465 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 17 Nov 2019 17:04:15 -0500 Subject: [PATCH 42/51] ftrace: Add a helper function to modify_ftrace_direct() to allow arch optimization If a direct ftrace callback is at a location that does not have any other ftrace helpers attached to it, it is possible to simply change the text to call the new caller (if the architecture supports it). But this requires special architecture code. Currently, modify_ftrace_direct() uses a trick to add a stub ftrace callback to the location, forcing it to call the ftrace iterator. Then it can change the direct helper to call the new function in C, and then remove the stub. Removing the stub will make the location call the new address that the direct helper is using. The new helper function implements this register-a-stub trick, but it is a weak function, allowing an architecture to override it and do something more direct.
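A sketch of what such an architecture override might look like (hypothetical; the actual text-patching primitive is arch-specific and elided here as a comment):

	int ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
					struct dyn_ftrace *rec,
					unsigned long old_addr,
					unsigned long new_addr)
	{
		/* The caller guarantees @rec has only the direct caller
		 * attached, so the call at rec->ip can be retargeted from
		 * old_addr to new_addr with the architecture's text-patching
		 * mechanism, and then the bookkeeping updated:
		 */
		entry->direct = new_addr;
		return 0;
	}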
Link: https://lore.kernel.org/r/20191115215125.mbqv7taqnx376yed@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 21 +++++++- kernel/trace/ftrace.c | 120 ++++++++++++++++++++++++++++++----------- 2 files changed, 107 insertions(+), 34 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 73eb2e93593f..dfaa37e1943d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -246,12 +246,24 @@ static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } #endif /* CONFIG_FUNCTION_TRACER */ +struct ftrace_func_entry { + struct hlist_node hlist; + unsigned long ip; + unsigned long direct; /* for direct lookup only */ +}; + +struct dyn_ftrace; + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr); struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); +int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr); #else # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) @@ -271,6 +283,13 @@ static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long a { return NULL; } +static inline int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -343,8 +362,6 @@ static inline void stack_tracer_enable(void) { } int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); -struct dyn_ftrace; - enum ftrace_bug_type { FTRACE_BUG_UNKNOWN, FTRACE_BUG_INIT, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ef79c8393f53..caae523f4ef3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1020,12 +1020,6 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -struct ftrace_func_entry { - struct hlist_node hlist; - unsigned long ip; - unsigned long direct; /* for direct lookup only */ -}; - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; @@ -5112,7 +5106,8 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip, + struct dyn_ftrace **recp) { struct ftrace_func_entry *entry; struct dyn_ftrace *rec; @@ -5132,6 +5127,9 @@ static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) /* Passed in ip just needs to be on the call site */ *ip = rec->ip; + if (recp) + *recp = rec; + return entry; } @@ -5143,7 +5141,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + entry = find_direct_entry(&ip, NULL); if (!entry) goto out_unlock; @@ -5179,6 +5177,75 @@ static struct ftrace_ops stub_ops = { .func = ftrace_stub, }; +/** + * ftrace_modify_direct_caller - modify ftrace nop directly + * 
@entry: The ftrace hash entry of the direct helper for @rec + * @rec: The record representing the function site to patch + * @old_addr: The location that the site at @rec->ip currently calls + * @new_addr: The location that the site at @rec->ip should call + * + * An architecture may overwrite this function to optimize the + * changing of the direct callback on an ftrace nop location. + * This is called with the ftrace_lock mutex held, and no other + * ftrace callbacks are on the associated record (@rec). Thus, + * it is safe to modify the ftrace record, where it should be + * currently calling @old_addr directly, to call @new_addr. + * + * Safety checks should be made to make sure that the code at + * @rec->ip is currently calling @old_addr. And this must + * also update entry->direct to @new_addr. + */ +int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr) +{ + unsigned long ip = rec->ip; + int ret; + + /* + * The ftrace_lock was used to determine if the record + * had more than one registered user to it. If it did, + * we needed to prevent that from changing to do the quick + * switch. But if it did not (only a direct caller was attached) + * then this function is called. But this function can deal + * with attached callers to the rec that we care about, and + * since this function uses standard ftrace calls that take + * the ftrace_lock mutex, we need to release it. + */ + mutex_unlock(&ftrace_lock); + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. + */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_lock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_lock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. + */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + out_lock: + mutex_lock(&ftrace_lock); + + return ret; +} + /** * modify_ftrace_direct - Modify an existing direct call to call something else * @ip: The instruction pointer to modify @@ -5197,11 +5264,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + mutex_lock(&ftrace_lock); + entry = find_direct_entry(&ip, &rec); if (!entry) goto out_unlock; @@ -5210,33 +5279,20 @@ int modify_ftrace_direct(unsigned long ip, goto out_unlock; /* - * By setting a stub function at the same address, we force - * the code to call the iterator and the direct_ops helper. - * This means that @ip does not call the direct call, and - * we can simply modify it. + * If there's no other ftrace callback on the rec->ip location, + * then it can be changed directly by the architecture. + * If there is another caller, then we just need to change the + * direct caller helper to point to @new_addr. 
*/ - ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); - if (ret) - goto out_unlock; - - ret = register_ftrace_function(&stub_ops); - if (ret) { - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - goto out_unlock; + if (ftrace_rec_count(rec) == 1) { + ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr); + } else { + entry->direct = new_addr; + ret = 0; } - entry->direct = new_addr; - - /* - * By removing the stub, we put back the direct call, calling - * the @new_addr. - */ - unregister_ftrace_function(&stub_ops); - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - - ret = 0; - out_unlock: + mutex_unlock(&ftrace_lock); mutex_unlock(&direct_mutex); return ret; } From 46f9469247c6f4697cbbf37e4b3961120bf07f29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 18 Nov 2019 10:41:29 -0500 Subject: [PATCH 43/51] ftrace: Rename ftrace_graph_stub to ftrace_stub_graph The ftrace_graph_stub was created and points to ftrace_stub as a way to assign the function graph tracer function pointer to a stub function with a different prototype than what ftrace_stub has and not trigger the C verifier. The ftrace_graph_stub was created via the linker script vmlinux.lds.h. Unfortunately, powerpc already uses the name ftrace_graph_stub for its internal implementation of the function graph tracer, and even though powerpc would still build, the change via the linker script prevented the function tracer on powerpc from working. By using the name ftrace_stub_graph, which does not exist anywhere else in the kernel, this should not be a problem. Link: https://lore.kernel.org/r/1573849732.5937.136.camel@lca.pw Fixes: b83b43ffc6e4 ("fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub") Reported-by: Qian Cai Signed-off-by: Steven Rostedt (VMware) --- include/asm-generic/vmlinux.lds.h | 8 ++++---- kernel/trace/fgraph.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0f358be551cd..996db32c491b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -112,7 +112,7 @@ #ifdef CONFIG_FTRACE_MCOUNT_RECORD #ifdef CC_USING_PATCHABLE_FUNCTION_ENTRY /* - * Need to also make ftrace_graph_stub point to ftrace_stub + * Need to also make ftrace_stub_graph point to ftrace_stub * so that the same stub location may have different protocols * and not mess up with C verifiers. */ @@ -120,17 +120,17 @@ __start_mcount_loc = .; \ KEEP(*(__patchable_function_entries)) \ __stop_mcount_loc = .; \ - ftrace_graph_stub = ftrace_stub; + ftrace_stub_graph = ftrace_stub; #else #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ __stop_mcount_loc = .; \ - ftrace_graph_stub = ftrace_stub; + ftrace_stub_graph = ftrace_stub; #endif #else # ifdef CONFIG_FUNCTION_TRACER -# define MCOUNT_REC() ftrace_graph_stub = ftrace_stub; +# define MCOUNT_REC() ftrace_stub_graph = ftrace_stub; # else # define MCOUNT_REC() # endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index fa3ce10d0405..67e0c462b059 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -336,10 +336,10 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) * Simply points to ftrace_stub, but with the proper protocol.
* Defined by the linker script in linux/vmlinux.lds.h */ -extern void ftrace_graph_stub(struct ftrace_graph_ret *); +extern void ftrace_stub_graph(struct ftrace_graph_ret *); /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -619,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = ftrace_graph_stub; + ftrace_graph_return = ftrace_stub_graph; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); From eb01fedc3d539f9443082aa2384c5d1ca26ed5c1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 20 Nov 2019 18:32:25 -0500 Subject: [PATCH 44/51] ftrace: Return ENOTSUPP when DYNAMIC_FTRACE_WITH_DIRECT_CALLS is not configured When CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is not set it's best to have the stub functions return ENOTSUPP instead of ENODEV, otherwise ENODEV is a valid error when ip is incorrect which is indistinguishable from ftrace not compiled in. Link: http://lkml.kernel.org/r/CAADnVQ+OzTikM9EhrfsC7NFsVYhATW1SVHxK64w3xn9qpk81pg@mail.gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index dfaa37e1943d..af79b0f8cdc1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -268,16 +268,16 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { - return -ENODEV; + return -ENOTSUPP; } static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { - return -ENODEV; + return -ENOTSUPP; } static inline int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { - return -ENODEV; + return -ENOTSUPP; } static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) { From b41db132821fdad9d80a177344a47468791fbe62 Mon Sep 17 00:00:00 2001 From: "Enrico Weigelt, metux IT consult" Date: Thu, 21 Nov 2019 14:38:15 +0100 Subject: [PATCH 45/51] ftrace: Use BIT() macro It's cleaner to use the BIT() macro instead of raw shift operation. 
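For reference, BIT(n) is simply a wrapper around the same shift (the kernel defines it in <linux/bits.h>), so the conversion below is purely cosmetic:

	#define BIT(nr) (UL(1) << (nr))	/* kernel definition, for reference */

	FTRACE_OPS_FL_RCU = BIT(14),	/* identical value to 1 << 14 */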
Link: http://lkml.kernel.org/r/20191121133815.15040-1-info@metux.net Signed-off-by: Enrico Weigelt, metux IT consult [ Added BIT() for bits 16 and 17 ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index af79b0f8cdc1..232806d5689d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -149,24 +149,24 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * (internal ftrace only, should not be used by others) */ enum { - FTRACE_OPS_FL_ENABLED = 1 << 0, - FTRACE_OPS_FL_DYNAMIC = 1 << 1, - FTRACE_OPS_FL_SAVE_REGS = 1 << 2, - FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 3, - FTRACE_OPS_FL_RECURSION_SAFE = 1 << 4, - FTRACE_OPS_FL_STUB = 1 << 5, - FTRACE_OPS_FL_INITIALIZED = 1 << 6, - FTRACE_OPS_FL_DELETED = 1 << 7, - FTRACE_OPS_FL_ADDING = 1 << 8, - FTRACE_OPS_FL_REMOVING = 1 << 9, - FTRACE_OPS_FL_MODIFYING = 1 << 10, - FTRACE_OPS_FL_ALLOC_TRAMP = 1 << 11, - FTRACE_OPS_FL_IPMODIFY = 1 << 12, - FTRACE_OPS_FL_PID = 1 << 13, - FTRACE_OPS_FL_RCU = 1 << 14, - FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, - FTRACE_OPS_FL_PERMANENT = 1 << 16, - FTRACE_OPS_FL_DIRECT = 1 << 17, + FTRACE_OPS_FL_ENABLED = BIT(0), + FTRACE_OPS_FL_DYNAMIC = BIT(1), + FTRACE_OPS_FL_SAVE_REGS = BIT(2), + FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = BIT(3), + FTRACE_OPS_FL_RECURSION_SAFE = BIT(4), + FTRACE_OPS_FL_STUB = BIT(5), + FTRACE_OPS_FL_INITIALIZED = BIT(6), + FTRACE_OPS_FL_DELETED = BIT(7), + FTRACE_OPS_FL_ADDING = BIT(8), + FTRACE_OPS_FL_REMOVING = BIT(9), + FTRACE_OPS_FL_MODIFYING = BIT(10), + FTRACE_OPS_FL_ALLOC_TRAMP = BIT(11), + FTRACE_OPS_FL_IPMODIFY = BIT(12), + FTRACE_OPS_FL_PID = BIT(13), + FTRACE_OPS_FL_RCU = BIT(14), + FTRACE_OPS_FL_TRACE_ARRAY = BIT(15), + FTRACE_OPS_FL_PERMANENT = BIT(16), + FTRACE_OPS_FL_DIRECT = BIT(17), }; #ifdef CONFIG_DYNAMIC_FTRACE From a82a4804b4ee3636c8988fea14d44f70f4de45f1 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Sat, 16 Nov 2019 10:05:55 -0500 Subject: [PATCH 46/51] ring-buffer: Fix typos in function ring_buffer_producer Fix spelling and other typos Link: http://lkml.kernel.org/r/1573916755-32478-1-git-send-email-xianting_tian@126.com Signed-off-by: Xianting Tian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 09b0b49f346e..32149e46551c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -269,10 +269,10 @@ static void ring_buffer_producer(void) #ifndef CONFIG_PREEMPTION /* - * If we are a non preempt kernel, the 10 second run will + * If we are a non preempt kernel, the 10 seconds run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a - * rescedule. + * reschedule. * * Do a cond resched at the same frequency we would wake up * the reader. 
From fc809bc5ceaa665497384064ab2d76713c774bad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:38:07 +0800 Subject: [PATCH 47/51] tracing: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Link: http://lkml.kernel.org/r/20191120133807.12741-1-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b872716bb2a0..f67620499faa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,7 +79,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -311,7 +311,7 @@ config TRACER_SNAPSHOT cat snapshot config TRACER_SNAPSHOT_PER_CPU_SWAP - bool "Allow snapshot to swap per CPU" + bool "Allow snapshot to swap per CPU" depends on TRACER_SNAPSHOT select RING_BUFFER_ALLOW_SWAP help @@ -683,7 +683,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. config TRACEPOINT_BENCHMARK - bool "Add tracepoint that benchmarks tracepoints" + bool "Add tracepoint that benchmarks tracepoints" help This option creates the tracepoint "benchmark:benchmark_event". When the tracepoint is enabled, it kicks off a kernel thread that @@ -732,7 +732,7 @@ config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER help - Run a simple self test on the ring buffer on boot up. Late in the + Run a simple self test on the ring buffer on boot up. Late in the kernel boot sequence, the test will start that kicks off a thread per cpu. Each thread will write various size events into the ring buffer. Another thread is created to send IPIs From 28879787147358e8ffcae397f11748de3dd26577 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Nov 2019 11:08:38 -0800 Subject: [PATCH 48/51] tracing: Adding new functions for kernel access to Ftrace instances Adding 2 new functions - 1) struct trace_array *trace_array_get_by_name(const char *name); Return pointer to a trace array with given name. If it does not exist, create and return pointer to the new trace array. 2) int trace_array_set_clr_event(struct trace_array *tr, const char *system ,const char *event, bool enable); Enable/Disable events to this trace array. Additionally, - To handle reference counters, export trace_array_put() - Due to introduction of the above 2 new functions, we no longer need to export - ftrace_set_clr_event & trace_array_create APIs. 
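A condensed sketch of how a module would pair these calls (hypothetical instance and event names; the full sample module follows in the next patch):

	struct trace_array *tr;

	tr = trace_array_get_by_name("my-instance"); /* takes a reference */
	if (!tr)
		return -ENOMEM;
	trace_array_set_clr_event(tr, "sched", "sched_switch", true);
	trace_array_printk(tr, _THIS_IP_, "hello from my module\n");
	trace_array_put(tr); /* drop the reference when done */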
Link: http://lkml.kernel.org/r/1574276919-11119-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Reviewed-by: Aruna Ramakrishna Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace.h | 3 +- include/linux/trace_events.h | 3 +- kernel/trace/trace.c | 96 ++++++++++++++++++++++++++++-------- kernel/trace/trace.h | 1 - kernel/trace/trace_events.c | 27 +++++++++- 5 files changed, 106 insertions(+), 24 deletions(-) diff --git a/include/linux/trace.h b/include/linux/trace.h index 24fcf07812ae..7fd86d3c691f 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -29,7 +29,8 @@ struct trace_array; void trace_printk_init_buffers(void); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); -struct trace_array *trace_array_create(const char *name); +void trace_array_put(struct trace_array *tr); +struct trace_array *trace_array_get_by_name(const char *name); int trace_array_destroy(struct trace_array *tr); #endif /* CONFIG_TRACING */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 60a41b7069dd..4c6e15605766 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -555,7 +555,8 @@ extern int trace_event_get_offsets(struct trace_event_call *call); int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); - +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable); /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42659ce6ac0c..02a23a6e5e00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -301,12 +301,24 @@ static void __trace_array_put(struct trace_array *this_tr) this_tr->ref--; } +/** + * trace_array_put - Decrement the reference counter for this trace array. + * + * NOTE: Use this when we no longer need the trace array returned by + * trace_array_get_by_name(). This ensures the trace array can be later + * destroyed. 
+ * + */ void trace_array_put(struct trace_array *this_tr) { + if (!this_tr) + return; + mutex_lock(&trace_types_lock); __trace_array_put(this_tr); mutex_unlock(&trace_types_lock); } +EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { @@ -8437,24 +8449,15 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -struct trace_array *trace_array_create(const char *name) +static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -EEXIST; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; - } - ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) - goto out_unlock; + return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) @@ -8499,8 +8502,8 @@ struct trace_array *trace_array_create(const char *name) list_add(&tr->list, &ftrace_trace_arrays); - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); + tr->ref++; + return tr; @@ -8510,24 +8513,77 @@ struct trace_array *trace_array_create(const char *name) kfree(tr->name); kfree(tr); - out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(trace_array_create); static int instance_mkdir(const char *name) { - return PTR_ERR_OR_ZERO(trace_array_create(name)); + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + ret = PTR_ERR_OR_ZERO(tr); + +out_unlock: + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return ret; } +/** + * trace_array_get_by_name - Create/Lookup a trace array, given its name. + * @name: The name of the trace array to be looked up/created. + * + * Returns pointer to trace array with given name. + * NULL, if it cannot be created. + * + * NOTE: This function increments the reference counter associated with the + * trace array returned. This makes sure it cannot be freed while in use. + * Use trace_array_put() once the trace array is no longer needed. + * + */ +struct trace_array *trace_array_get_by_name(const char *name) +{ + struct trace_array *tr; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + if (IS_ERR(tr)) + tr = NULL; +out_unlock: + if (tr) + tr->ref++; + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return tr; +} +EXPORT_SYMBOL_GPL(trace_array_get_by_name); + static int __remove_instance(struct trace_array *tr) { int i; - if (tr->ref || (tr->current_trace && tr->current_trace->ref)) + /* Reference counter for a newly created trace array = 1. 
*/ + if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) return -EBUSY; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2df8aed6a8f0..ca7fccafbcbb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -345,7 +345,6 @@ extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); -extern void trace_array_put(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2a3ac2365445..6b3a69e9aa6a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -827,7 +827,6 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } -EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event @@ -852,6 +851,32 @@ int trace_set_clr_event(const char *system, const char *event, int set) } EXPORT_SYMBOL_GPL(trace_set_clr_event); +/** + * trace_array_set_clr_event - enable or disable an event for a trace array. + * @tr: concerned trace array. + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @enable: true to enable, false to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable) +{ + int set; + + if (!tr) + return -ENOENT; + + set = (enable == true) ? 1 : 0; + return __ftrace_set_clr_event(tr, NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_array_set_clr_event); + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 From 89ed42495ef4a020435f2b999093bb5731eeb8b9 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Nov 2019 11:08:39 -0800 Subject: [PATCH 49/51] tracing: Sample module to demonstrate kernel access to Ftrace instances. This is a sample module to demonstrate the use of the newly introduced and exported APIs to access Ftrace instances from within the kernel. Newly introduced APIs used here - 1. Create/Lookup a trace array with the given name. struct trace_array *trace_array_get_by_name(const char *name) 2. Destroy/Remove a trace array. int trace_array_destroy(struct trace_array *tr) 4. Enable/Disable trace events: int trace_array_set_clr_event(struct trace_array *tr, const char *system, const char *event, bool enable); Exported APIs - 1. trace_printk equivalent for instances. int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); 2. Helper function. void trace_printk_init_buffers(void); 3. To decrement the reference counter. void trace_array_put(struct trace_array *tr) Sample output(contents of /sys/kernel/tracing/instances/sample-instance) NOTE: Tracing disabled after ~5 sec) _-----=> irqs-off / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / delay TASK-PID CPU# |||| TIMESTAMP FUNCTION | | | |||| | | sample-instance-1452 [002] .... 49.430948: simple_thread: trace_array_printk: count=0 sample-instance-1452 [002] .... 49.430951: sample_event: count value=0 at jiffies=4294716608 sample-instance-1452 [002] .... 50.454847: simple_thread: trace_array_printk: count=1 sample-instance-1452 [002] .... 
50.454849: sample_event: count value=1 at jiffies=4294717632 sample-instance-1452 [002] .... 51.478748: simple_thread: trace_array_printk: count=2 sample-instance-1452 [002] .... 51.478750: sample_event: count value=2 at jiffies=4294718656 sample-instance-1452 [002] .... 52.502652: simple_thread: trace_array_printk: count=3 sample-instance-1452 [002] .... 52.502655: sample_event: count value=3 at jiffies=4294719680 sample-instance-1452 [002] .... 53.526533: simple_thread: trace_array_printk: count=4 sample-instance-1452 [002] .... 53.526535: sample_event: count value=4 at jiffies=4294720704 sample-instance-1452 [002] .... 54.550438: simple_thread: trace_array_printk: count=5 sample-instance-1452 [002] .... 55.574336: simple_thread: trace_array_printk: count=6 Link: http://lkml.kernel.org/r/1574276919-11119-3-git-send-email-divya.indi@oracle.com Reviewed-by: Aruna Ramakrishna Signed-off-by: Divya Indi [ Moved to samples/ftrace ] Signed-off-by: Steven Rostedt (VMware) --- samples/Kconfig | 7 ++ samples/Makefile | 1 + samples/ftrace/Makefile | 3 + samples/ftrace/sample-trace-array.c | 131 ++++++++++++++++++++++++++++ samples/ftrace/sample-trace-array.h | 84 ++++++++++++++++++ 5 files changed, 226 insertions(+) create mode 100644 samples/ftrace/sample-trace-array.c create mode 100644 samples/ftrace/sample-trace-array.h diff --git a/samples/Kconfig b/samples/Kconfig index 65b5967dac0c..ad64b223f3d0 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -27,6 +27,13 @@ config SAMPLE_FTRACE_DIRECT This builds an ftrace direct function example that hooks to wake_up_process and prints the parameters. +config SAMPLE_TRACE_ARRAY + tristate "Build sample module for kernel access to Ftrace instances" + depends on EVENT_TRACING && m + help + This builds a module that demonstrates the use of various APIs to + access Ftrace instances from within the kernel. + config SAMPLE_KOBJECT tristate "Build kobject examples" help diff --git a/samples/Makefile b/samples/Makefile index cd496d633370..297f8934773f 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -18,6 +18,7 @@ subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/ obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ +obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ obj-y += vfio-mdev/ subdir-$(CONFIG_SAMPLE_VFS) += vfs diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile index fb0c3ae18295..4ce896e10b2e 100644 --- a/samples/ftrace/Makefile +++ b/samples/ftrace/Makefile @@ -3,3 +3,6 @@ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o + +CFLAGS_sample-trace-array.o := -I$(src) +obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c new file mode 100644 index 000000000000..d523450d73eb --- /dev/null +++ b/samples/ftrace/sample-trace-array.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace.h> +#include <linux/trace_events.h> +#include <linux/timer.h> +#include <linux/err.h> +#include <linux/jiffies.h> + +/* + * Any file that uses trace points must include the header. + * But only one file must include the header, by defining + * CREATE_TRACE_POINTS first. This will make the C code that + * creates the handles for the trace points.
+ */ +#define CREATE_TRACE_POINTS +#include "sample-trace-array.h" + +struct trace_array *tr; +static void mytimer_handler(struct timer_list *unused); +static struct task_struct *simple_tsk; + +/* + * mytimer: Timer setup to disable tracing for event "sample_event". This + * timer is only for the purposes of the sample module to demonstrate access of + * Ftrace instances from within kernel. + */ +static DEFINE_TIMER(mytimer, mytimer_handler); + +static void mytimer_handler(struct timer_list *unused) +{ + /* + * Disable tracing for event "sample_event". + */ + trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", + false); +} + +static void simple_thread_func(int count) +{ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + + /* + * Printing count value using trace_array_printk() - trace_printk() + * equivalent for the instance buffers. + */ + trace_array_printk(tr, _THIS_IP_, "trace_array_printk: count=%d\n", + count); + /* + * Tracepoint for event "sample_event". This will print the + * current value of count and current jiffies. + */ + trace_sample_event(count, jiffies); +} + +static int simple_thread(void *arg) +{ + int count = 0; + unsigned long delay = msecs_to_jiffies(5000); + + /* + * Enable tracing for "sample_event". + */ + trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", true); + + /* + * Adding timer - mytimer. This timer will disable tracing after + * delay seconds. + * + */ + add_timer(&mytimer); + mod_timer(&mytimer, jiffies+delay); + + while (!kthread_should_stop()) + simple_thread_func(count++); + + del_timer(&mytimer); + + /* + * trace_array_put() decrements the reference counter associated with + * the trace array - "tr". We are done using the trace array, hence + * decrement the reference counter so that it can be destroyed using + * trace_array_destroy(). + */ + trace_array_put(tr); + + return 0; +} + +static int __init sample_trace_array_init(void) +{ + /* + * Return a pointer to the trace array with name "sample-instance" if it + * exists, else create a new trace array. + * + * NOTE: This function increments the reference counter + * associated with the trace array - "tr". + */ + tr = trace_array_get_by_name("sample-instance"); + + if (!tr) + return -1; + /* + * If context specific per-cpu buffers havent already been allocated. + */ + trace_printk_init_buffers(); + + simple_tsk = kthread_run(simple_thread, NULL, "sample-instance"); + if (IS_ERR(simple_tsk)) + return -1; + return 0; +} + +static void __exit sample_trace_array_exit(void) +{ + kthread_stop(simple_tsk); + + /* + * We are unloading our module and no longer require the trace array. + * Remove/destroy "tr" using trace_array_destroy() + */ + trace_array_destroy(tr); +} + +module_init(sample_trace_array_init); +module_exit(sample_trace_array_exit); + +MODULE_AUTHOR("Divya Indi"); +MODULE_DESCRIPTION("Sample module for kernel access to Ftrace instances"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/sample-trace-array.h b/samples/ftrace/sample-trace-array.h new file mode 100644 index 000000000000..6f8962428158 --- /dev/null +++ b/samples/ftrace/sample-trace-array.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * If TRACE_SYSTEM is defined, that will be the directory created + * in the ftrace directory under /sys/kernel/tracing/events/ + * + * The define_trace.h below will also look for a file name of + * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. 
+ * In this case, it would look for sample-subsystem.h
+ *
+ * If the header name will be different from the system name
+ * (as in this case), then you can override the header name that
+ * define_trace.h will look up by defining TRACE_INCLUDE_FILE
+ *
+ * This file is called sample-trace-array.h, but we want the system
+ * to be called "sample-subsystem". Therefore we must define the name
+ * of this file:
+ *
+ * #define TRACE_INCLUDE_FILE sample-trace-array
+ *
+ * as we do at the bottom of this file.
+ *
+ * Notice that TRACE_SYSTEM should be defined outside of the #if
+ * protection, just like TRACE_INCLUDE_FILE.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sample-subsystem
+
+/*
+ * TRACE_SYSTEM is expected to be a valid C variable name (alphanumeric
+ * characters and underscores), although it may start with numbers. If
+ * for some reason it is not, you need to add the following lines:
+ */
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR sample_subsystem
+
+/*
+ * But the above is only needed if TRACE_SYSTEM is not made up of
+ * alphanumeric characters and underscores. By default, TRACE_SYSTEM_VAR
+ * will be equal to TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be
+ * alphanumeric, if TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be
+ * defined with only alphanumeric characters and underscores.
+ *
+ * TRACE_SYSTEM_VAR is only used internally and is not visible to
+ * user space.
+ */
+
+/*
+ * Notice that this file is not protected like a normal header.
+ * We also must allow for rereading of this file. The
+ *
+ *  || defined(TRACE_HEADER_MULTI_READ)
+ *
+ * serves this purpose.
+ */
+#if !defined(_SAMPLE_TRACE_ARRAY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _SAMPLE_TRACE_ARRAY_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(sample_event,
+
+	TP_PROTO(int count, unsigned long time),
+
+	TP_ARGS(count, time),
+
+	TP_STRUCT__entry(
+		__field(int, count)
+		__field(unsigned long, time)
+	),
+
+	TP_fast_assign(
+		__entry->count = count;
+		__entry->time = time;
+	),
+
+	TP_printk("count value=%d at jiffies=%lu", __entry->count,
+		  __entry->time)
+	);
+#endif
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE sample-trace-array
+#include <trace/define_trace.h>

From 0e24220821b0e0e330a18bfef29ac6396545d62e Mon Sep 17 00:00:00 2001
From: Hassan Naveed
Date: Fri, 15 Nov 2019 23:44:42 +0000
Subject: [PATCH 50/51] tracing: Use xarray for syscall trace events

Currently, a lot of memory is wasted for architectures like MIPS when
init_ftrace_syscalls() allocates the array for syscalls using kcalloc.
This is because syscall numbers start from 4000, 5000 or 6000 and
array elements up to that point are unused. Fix this by using a data
structure more suited to storing sparsely populated arrays. The XArray
data structure, implemented using radix trees, is much more memory
efficient for storing the syscalls in question.

Link: http://lkml.kernel.org/r/20191115234314.21599-1-hnaveed@wavecomp.com

Signed-off-by: Hassan Naveed
Reviewed-by: Paul Burton
Signed-off-by: Steven Rostedt (VMware)
---
 arch/Kconfig                  |  8 ++++++++
 kernel/trace/trace_syscalls.c | 30 ++++++++++++++++++++++++------
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 5f8a5d84dbbe..69c87e8608d8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -960,6 +960,14 @@ config RELR
 config ARCH_HAS_MEM_ENCRYPT
 	bool
 
+config HAVE_SPARSE_SYSCALL_NR
+	bool
+	help
+	  An architecture should select this if its syscall numbering is sparse
+	  to save space. For example, the MIPS architecture has a syscall array
+	  with entries at the 4000, 5000 and 6000 locations.
+	  This option turns on syscall-related optimizations for a given
+	  architecture.
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index fa8fbff736d6..16fa218556fa 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
+#include <linux/xarray.h>
 #include <asm/syscall.h>
 
 #include "trace_output.h"
@@ -30,6 +31,7 @@ syscall_get_enter_fields(struct trace_event_call *call)
 extern struct syscall_metadata *__start_syscalls_metadata[];
 extern struct syscall_metadata *__stop_syscalls_metadata[];
 
+static DEFINE_XARRAY(syscalls_metadata_sparse);
 static struct syscall_metadata **syscalls_metadata;
 
 #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
@@ -101,6 +103,9 @@ find_syscall_meta(unsigned long syscall)
 
 static struct syscall_metadata *syscall_nr_to_meta(int nr)
 {
+	if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
+		return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
+
 	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
 		return NULL;
 
@@ -536,12 +541,16 @@ void __init init_ftrace_syscalls(void)
 	struct syscall_metadata *meta;
 	unsigned long addr;
 	int i;
+	void *ret;
 
-	syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
-				    GFP_KERNEL);
-	if (!syscalls_metadata) {
-		WARN_ON(1);
-		return;
+	if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
+		syscalls_metadata = kcalloc(NR_syscalls,
+					    sizeof(*syscalls_metadata),
+					    GFP_KERNEL);
+		if (!syscalls_metadata) {
+			WARN_ON(1);
+			return;
+		}
 	}
 
 	for (i = 0; i < NR_syscalls; i++) {
@@ -551,7 +560,16 @@ void __init init_ftrace_syscalls(void)
 			continue;
 
 		meta->syscall_nr = i;
-		syscalls_metadata[i] = meta;
+
+		if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
+			syscalls_metadata[i] = meta;
+		} else {
+			ret = xa_store(&syscalls_metadata_sparse, i, meta,
+				       GFP_KERNEL);
+			WARN(xa_is_err(ret),
+			     "Syscall memory allocation failed\n");
+		}
+
 	}
 }

From 16c0f03f629a89e6a1249497202b2c154ff46206 Mon Sep 17 00:00:00 2001
From: Hassan Naveed
Date: Fri, 15 Nov 2019 23:44:49 +0000
Subject: [PATCH 51/51] tracing: Enable syscall optimization for MIPS

Since the MIPS architecture has a sparse syscall array, select
HAVE_SPARSE_SYSCALL_NR to save space.

Link: http://lkml.kernel.org/r/20191115234314.21599-2-hnaveed@wavecomp.com

Signed-off-by: Hassan Naveed
Reviewed-by: Paul Burton
Signed-off-by: Steven Rostedt (VMware)
---
 arch/mips/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index a0bd9bdb5f83..c7f59c4a00d8 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -73,6 +73,7 @@ config MIPS
 	select HAVE_PERF_EVENTS
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RSEQ
+	select HAVE_SPARSE_SYSCALL_NR
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP
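
--
Editorial note, not part of the patch series: patch 50's memory saving comes entirely from the XArray keeping only populated indices in memory, where the kcalloc()'d table had to reserve NR_syscalls slots up front. The self-contained module below is a minimal sketch of that same xa_store()/xa_load() pattern in isolation, using the MIPS-style 4000/5000/6000 indices from the commit message. The sparse_demo names and integer payloads are hypothetical stand-ins, not kernel code.

// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/xarray.h>

/* Hypothetical demo names; the ints stand in for syscall_metadata. */
static DEFINE_XARRAY(sparse_demo);
static int dummy[3] = { 40, 50, 60 };

static int __init sparse_demo_init(void)
{
	/* Indices mimic the MIPS o32/n64/n32 syscall bases. */
	static const unsigned long nr[] = { 4000, 5000, 6000 };
	int i;

	for (i = 0; i < ARRAY_SIZE(nr); i++) {
		/* Interior tree nodes are allocated on demand. */
		void *old = xa_store(&sparse_demo, nr[i], &dummy[i],
				     GFP_KERNEL);

		if (xa_is_err(old))
			return xa_err(old);
	}

	/* Unpopulated indices return NULL, as in syscall_nr_to_meta(). */
	WARN_ON(xa_load(&sparse_demo, 4500) != NULL);
	return 0;
}

static void __exit sparse_demo_exit(void)
{
	xa_destroy(&sparse_demo);
}

module_init(sparse_demo_init);
module_exit(sparse_demo_exit);
MODULE_LICENSE("GPL");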