Historically, kretprobe has always produced unusable stack traces (kretprobe_trampoline is the only entry in most cases, because of the funky stack pointer overwriting). This has caused quite a few annoyances when using tracing to debug problems[1] -- since return values are only available with kretprobes but stack traces were only usable for kprobes, users had to probe both and then manually associate them.
This patch series stores the stack trace within kretprobe_instance on the kprobe entry used to set up the kretprobe. This allows for DTrace-style stack aggregation between function entry and exit with tools like BPFtrace -- which would not really be doable if the stack unwinder understood kretprobe_trampoline.
We also revert commit 76094a2cf46e ("ftrace: distinguish kretprobe'd functions in trace logs") and any follow-up changes because that code is no longer necessary now that stack traces are sane. *However* this patch might be a bit contentious since the original usecase (that ftrace returns shouldn't show kretprobe_trampoline) is arguably still an issue. Feel free to drop it if you think it is wrong.
Patch changelog: v2: * documentation: mention kretprobe stack-stashing * ftrace: add self-test for fixed kretprobe stacktraces * ftrace: remove [unknown/kretprobe'd] handling * kprobe: remove needless EXPORT statements * kprobe: minor corrections to current_kretprobe_instance (switch away from hlist_for_each_entry_safe) * kprobe: make maximum stack size 127, which is the ftrace default
(I forgot to Cc the BPF folks in v1, I've added them now.)
Aleksa Sarai (2): kretprobe: produce sane stack traces trace: remove kretprobed checks
Documentation/kprobes.txt | 6 +- include/linux/kprobes.h | 15 +++ kernel/events/callchain.c | 8 +- kernel/kprobes.c | 101 +++++++++++++++++- kernel/trace/trace.c | 11 +- kernel/trace/trace_output.c | 34 +----- .../test.d/kprobe/kretprobe_stacktrace.tc | 25 +++++ 7 files changed, 165 insertions(+), 35 deletions(-) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
Historically, kretprobe has always produced unusable stack traces (kretprobe_trampoline is the only entry in most cases, because of the funky stack pointer overwriting). This has caused quite a few annoyances when using tracing to debug problems[1] -- since return values are only available with kretprobes but stack traces were only usable for kprobes, users had to probe both and then manually associate them.
With the advent of bpf_trace, users would have been able to do this association in bpf, but this was less than ideal (because bpf_get_stackid would still produce rubbish and programs that didn't know better would get silly results). The main usecase for stack traces (at least with bpf_trace) is for DTrace-style aggregation on stack traces (both entry and exit). Therefore we cannot simply correct the stack trace on exit -- we must stash away the stack trace and return the entry stack trace when it is requested.
[1]: https://github.com/iovisor/bpftrace/issues/101
Cc: Brendan Gregg bgregg@netflix.com Cc: Christian Brauner christian@brauner.io Signed-off-by: Aleksa Sarai cyphar@cyphar.com --- Documentation/kprobes.txt | 6 +- include/linux/kprobes.h | 15 +++ kernel/events/callchain.c | 8 +- kernel/kprobes.c | 101 +++++++++++++++++- kernel/trace/trace.c | 11 +- .../test.d/kprobe/kretprobe_stacktrace.tc | 25 +++++ 6 files changed, 161 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 10f4499e677c..1965585848f4 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -597,7 +597,11 @@ address with the trampoline's address, stack backtraces and calls to __builtin_return_address() will typically yield the trampoline's address instead of the real return address for kretprobed functions. (As far as we can tell, __builtin_return_address() is used only -for instrumentation and error reporting.) +for instrumentation and error reporting.) However, since return probes +are used extensively in tracing (where stack backtraces are useful), +return probes will stash away the stack backtrace during function entry +so that return probe handlers can use the entry backtrace instead of +having a trace with just kretprobe_trampoline.
If the number of times a function is called does not match the number of times it returns, registering a return probe on that function may diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e909413e4e38..b2d6e8b74731 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -40,6 +40,8 @@ #include <linux/rcupdate.h> #include <linux/mutex.h> #include <linux/ftrace.h> +#include <linux/stacktrace.h> +#include <linux/perf_event.h> #include <asm/kprobes.h>
#ifdef CONFIG_KPROBES @@ -168,11 +170,18 @@ struct kretprobe { raw_spinlock_t lock; };
+#define KRETPROBE_TRACE_SIZE 127 +struct kretprobe_trace { + int nr_entries; + unsigned long entries[KRETPROBE_TRACE_SIZE]; +}; + struct kretprobe_instance { struct hlist_node hlist; struct kretprobe *rp; kprobe_opcode_t *ret_addr; struct task_struct *task; + struct kretprobe_trace entry; char data[0]; };
@@ -371,6 +380,12 @@ void unregister_kretprobe(struct kretprobe *rp); int register_kretprobes(struct kretprobe **rps, int num); void unregister_kretprobes(struct kretprobe **rps, int num);
+struct kretprobe_instance *current_kretprobe_instance(void); +void kretprobe_save_stack_trace(struct kretprobe_instance *ri, + struct stack_trace *trace); +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri, + struct perf_callchain_entry_ctx *ctx); + void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 24a77c34e9ad..98edcd8a6987 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -12,6 +12,7 @@ #include <linux/perf_event.h> #include <linux/slab.h> #include <linux/sched/task_stack.h> +#include <linux/kprobes.h>
#include "internal.h"
@@ -197,9 +198,14 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) { + struct kretprobe_instance *ri = current_kretprobe_instance(); + if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(&ctx, regs); + if (ri) + kretprobe_perf_callchain_kernel(ri, &ctx); + else + perf_callchain_kernel(&ctx, regs); }
if (user) { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 90e98e233647..fca3964d18cd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1206,6 +1206,16 @@ __releases(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_unlock);
+static bool kretprobe_hash_is_locked(struct task_struct *tsk) +{ + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); + raw_spinlock_t *hlist_lock; + + hlist_lock = kretprobe_table_lock_ptr(hash); + return raw_spin_is_locked(hlist_lock); +} +NOKPROBE_SYMBOL(kretprobe_hash_is_locked); + /* * This function is called from finish_task_switch when task tk becomes dead, * so that we can recycle any function-return probe instances associated @@ -1800,6 +1810,13 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; }
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs); + +static inline bool kprobe_is_retprobe(struct kprobe *kp) +{ + return kp->pre_handler == pre_handler_kretprobe; +} + #ifdef CONFIG_KRETPROBES /* * This kprobe pre_handler is registered with every kretprobe. When probe @@ -1826,6 +1843,8 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { + struct stack_trace trace = {}; + ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); hlist_del(&ri->hlist); @@ -1834,6 +1853,11 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) ri->rp = rp; ri->task = current;
+ trace.entries = &ri->entry.entries[0]; + trace.max_entries = KRETPROBE_TRACE_SIZE; + save_stack_trace_regs(regs, &trace); + ri->entry.nr_entries = trace.nr_entries; + if (rp->entry_handler && rp->entry_handler(ri, regs)) { raw_spin_lock_irqsave(&rp->lock, flags); hlist_add_head(&ri->hlist, &rp->free_instances); @@ -1856,6 +1880,65 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe);
+/* + * Return the kretprobe_instance associated with the current_kprobe. Calling + * this is only reasonable from within a kretprobe handler context (otherwise + * return NULL). + * + * Must be called within a kretprobe_hash_lock(current, ...) context. + */ +struct kretprobe_instance *current_kretprobe_instance(void) +{ + struct kprobe *kp; + struct kretprobe *rp; + struct kretprobe_instance *ri; + struct hlist_head *head; + unsigned long hash = hash_ptr(current, KPROBE_HASH_BITS); + + kp = kprobe_running(); + if (!kp || !kprobe_is_retprobe(kp)) + return NULL; + if (WARN_ON(!kretprobe_hash_is_locked(current))) + return NULL; + + rp = container_of(kp, struct kretprobe, kp); + head = &kretprobe_inst_table[hash]; + + hlist_for_each_entry(ri, head, hlist) { + if (ri->task == current && ri->rp == rp) + return ri; + } + return NULL; +} +EXPORT_SYMBOL_GPL(current_kretprobe_instance); +NOKPROBE_SYMBOL(current_kretprobe_instance); + +void kretprobe_save_stack_trace(struct kretprobe_instance *ri, + struct stack_trace *trace) +{ + int i; + struct kretprobe_trace *krt = &ri->entry; + + for (i = trace->skip; i < krt->nr_entries; i++) { + if (trace->nr_entries >= trace->max_entries) + break; + trace->entries[trace->nr_entries++] = krt->entries[i]; + } +} + +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri, + struct perf_callchain_entry_ctx *ctx) +{ + int i; + struct kretprobe_trace *krt = &ri->entry; + + for (i = 0; i < krt->nr_entries; i++) { + if (krt->entries[i] == ULONG_MAX) + break; + perf_callchain_store(ctx, (u64) krt->entries[i]); + } +} + bool __weak arch_kprobe_on_func_entry(unsigned long offset) { return !offset; @@ -2005,6 +2088,22 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe);
+struct kretprobe_instance *current_kretprobe_instance(void) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(current_kretprobe_instance); +NOKPROBE_SYMBOL(current_kretprobe_instance); + +void kretprobe_save_stack_trace(struct kretprobe_instance *ri, + struct stack_trace *trace) +{ +} + +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri, + struct perf_callchain_entry_ctx *ctx) +{ +} #endif /* CONFIG_KRETPROBES */
/* Set the kprobe gone and remove its instruction buffer. */ @@ -2241,7 +2340,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, char *kprobe_type; void *addr = p->addr;
- if (p->pre_handler == pre_handler_kretprobe) + if (kprobe_is_retprobe(p)) kprobe_type = "r"; else kprobe_type = "k"; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bf6f1d70484d..2210d38a4dbf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -42,6 +42,7 @@ #include <linux/nmi.h> #include <linux/fs.h> #include <linux/trace.h> +#include <linux/kprobes.h> #include <linux/sched/clock.h> #include <linux/sched/rt.h>
@@ -2590,6 +2591,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; + struct kretprobe_instance *ri = current_kretprobe_instance(); int use_stack; int size = FTRACE_STACK_ENTRIES;
@@ -2626,7 +2628,9 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.entries = this_cpu_ptr(ftrace_stack.calls); trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
- if (regs) + if (ri) + kretprobe_save_stack_trace(ri, &trace); + else if (regs) save_stack_trace_regs(regs, &trace); else save_stack_trace(&trace); @@ -2653,7 +2657,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, else { trace.max_entries = FTRACE_STACK_ENTRIES; trace.entries = entry->caller; - if (regs) + + if (ri) + kretprobe_save_stack_trace(ri, &trace); + else if (regs) save_stack_trace_regs(regs, &trace); else save_stack_trace(&trace); diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc new file mode 100644 index 000000000000..03146c6a1a3c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc @@ -0,0 +1,25 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# description: Kretprobe dynamic event with a stacktrace + +[ -f kprobe_events ] || exit_unsupported # this is configurable + +echo 0 > events/enable +echo 1 > options/stacktrace + +echo 'r:teststackprobe sched_fork $retval' > kprobe_events +grep teststackprobe kprobe_events +test -d events/kprobes/teststackprobe + +clear_trace +echo 1 > events/kprobes/teststackprobe/enable +( echo "forked") +echo 0 > events/kprobes/teststackprobe/enable + +# Make sure we don't see kretprobe_trampoline and we see _do_fork. +! grep 'kretprobe' trace +grep '_do_fork' trace + +echo '-:teststackprobe' >> kprobe_events +clear_trace +test -d events/kprobes/teststackprobe && exit_fail || exit_pass
Hi Aleksa,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on tip/perf/core] [also build test ERROR on v4.19 next-20181031] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Aleksa-Sarai/kretprobe-produce-sane... config: i386-tinyconfig (attached as .config) compiler: gcc-7 (Debian 7.3.0-1) 7.3.0 reproduce: # save the attached .config to linux build tree make ARCH=i386
All error/warnings (new ones prefixed by >>):
kernel/events/callchain.c: In function 'get_perf_callchain':
kernel/events/callchain.c:201:35: error: implicit declaration of function 'current_kretprobe_instance'; did you mean 'current_top_of_stack'? [-Werror=implicit-function-declaration]
struct kretprobe_instance *ri = current_kretprobe_instance(); ^~~~~~~~~~~~~~~~~~~~~~~~~~ current_top_of_stack
kernel/events/callchain.c:201:35: warning: initialization makes pointer from integer without a cast [-Wint-conversion] kernel/events/callchain.c:206:4: error: implicit declaration of function 'kretprobe_perf_callchain_kernel'; did you mean 'perf_callchain_kernel'? [-Werror=implicit-function-declaration]
kretprobe_perf_callchain_kernel(ri, &ctx); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ perf_callchain_kernel cc1: some warnings being treated as errors
vim +201 kernel/events/callchain.c
178 179 struct perf_callchain_entry * 180 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, 181 u32 max_stack, bool crosstask, bool add_mark) 182 { 183 struct perf_callchain_entry *entry; 184 struct perf_callchain_entry_ctx ctx; 185 int rctx; 186 187 entry = get_callchain_entry(&rctx); 188 if (rctx == -1) 189 return NULL; 190 191 if (!entry) 192 goto exit_put; 193 194 ctx.entry = entry; 195 ctx.max_stack = max_stack; 196 ctx.nr = entry->nr = init_nr; 197 ctx.contexts = 0; 198 ctx.contexts_maxed = false; 199 200 if (kernel && !user_mode(regs)) {
201 struct kretprobe_instance *ri = current_kretprobe_instance();
202 203 if (add_mark) 204 perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); 205 if (ri)
206 kretprobe_perf_callchain_kernel(ri, &ctx);
207 else 208 perf_callchain_kernel(&ctx, regs); 209 } 210 211 if (user) { 212 if (!user_mode(regs)) { 213 if (current->mm) 214 regs = task_pt_regs(current); 215 else 216 regs = NULL; 217 } 218 219 if (regs) { 220 mm_segment_t fs; 221 222 if (crosstask) 223 goto exit_put; 224 225 if (add_mark) 226 perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); 227 228 fs = get_fs(); 229 set_fs(USER_DS); 230 perf_callchain_user(&ctx, regs); 231 set_fs(fs); 232 } 233 } 234 235 exit_put: 236 put_callchain_entry(rctx); 237 238 return entry; 239 } 240
--- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
This is effectively a reversion of commit 76094a2cf46e ("ftrace: distinguish kretprobe'd functions in trace logs"), as the checking of kretprobe_trampoline *for tracing* is no longer necessary with the new kretprobe stack trace changes.
Signed-off-by: Aleksa Sarai cyphar@cyphar.com --- kernel/trace/trace_output.c | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 6e6cc64faa38..951de16bd4fd 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -321,36 +321,14 @@ int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_output_call);
-#ifdef CONFIG_KRETPROBES -static inline const char *kretprobed(const char *name) -{ - static const char tramp_name[] = "kretprobe_trampoline"; - int size = sizeof(tramp_name); - - if (strncmp(tramp_name, name, size) == 0) - return "[unknown/kretprobe'd]"; - return name; -} -#else -static inline const char *kretprobed(const char *name) -{ - return name; -} -#endif /* CONFIG_KRETPROBES */ - static void seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_KALLSYMS - const char *name; - kallsyms_lookup(address, NULL, NULL, NULL, str); - - name = kretprobed(str); - - if (name && strlen(name)) { - trace_seq_printf(s, fmt, name); + if (strlen(str)) { + trace_seq_printf(s, fmt, str); return; } #endif @@ -364,13 +342,9 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, { char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_KALLSYMS - const char *name; - sprint_symbol(str, address); - name = kretprobed(str); - - if (name && strlen(name)) { - trace_seq_printf(s, fmt, name); + if (strlen(str)) { + trace_seq_printf(s, fmt, str); return; } #endif
linux-kselftest-mirror@lists.linaro.org