Historically, kretprobe has always produced unusable stack traces (kretprobe_trampoline is the only entry in most cases, because of the funky stack pointer overwriting). This has caused quite a few annoyances when using tracing to debug problems[1] -- since return values are only available with kretprobes but stack traces were only usable for kprobes, users had to probe both and then manually associate them.
This patch series stores the stack trace within kretprobe_instance at the kprobe entry used to set up the kretprobe. This allows for DTrace-style stack aggregation between function entry and exit with tools like BPFtrace -- something that would not really be doable even if the stack unwinder understood kretprobe_trampoline, since by the time the function returns the entry-time stack is already gone.
We also revert commit 76094a2cf46e ("ftrace: distinguish kretprobe'd functions in trace logs") and any follow-up changes because that code is no longer necessary now that stack traces are sane. *However* this patch might be a bit contentious since the original usecase (that ftrace returns shouldn't show kretprobe_trampoline) is arguably still an issue. Feel free to drop it if you think it is wrong.
Patch changelog:
v3:
* kprobe: fix build on !CONFIG_KPROBES
v2:
* documentation: mention kretprobe stack-stashing
* ftrace: add self-test for fixed kretprobe stacktraces
* ftrace: remove [unknown/kretprobe'd] handling
* kprobe: remove needless EXPORT statements
* kprobe: minor corrections to current_kretprobe_instance (switch away from hlist_for_each_entry_safe)
* kprobe: make maximum stack size 127, which is the ftrace default
Aleksa Sarai (2):
  kretprobe: produce sane stack traces
  trace: remove kretprobed checks
 Documentation/kprobes.txt                    |   6 +-
 include/linux/kprobes.h                      |  27 +++++
 kernel/events/callchain.c                    |   8 +-
 kernel/kprobes.c                             | 101 +++++++++++++++++-
 kernel/trace/trace.c                         |  11 +-
 kernel/trace/trace_output.c                  |  34 +-----
 .../test.d/kprobe/kretprobe_stacktrace.tc    |  25 +++++
 7 files changed, 177 insertions(+), 35 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
Historically, kretprobe has always produced unusable stack traces (kretprobe_trampoline is the only entry in most cases, because of the funky stack pointer overwriting). This has caused quite a few annoyances when using tracing to debug problems[1] -- since return values are only available with kretprobes but stack traces were only usable for kprobes, users had to probe both and then manually associate them.
With the advent of bpf_trace, users would have been able to do this association in bpf, but this was less than ideal (because bpf_get_stackid would still produce rubbish and programs that didn't know better would get silly results). The main usecase for stack traces (at least with bpf_trace) is for DTrace-style aggregation on stack traces (both entry and exit). Therefore we cannot simply correct the stack trace on exit -- we must stash away the stack trace and return the entry stack trace when it is requested.
[1]: https://github.com/iovisor/bpftrace/issues/101
Cc: Brendan Gregg <bgregg@netflix.com>
Cc: Christian Brauner <christian@brauner.io>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 Documentation/kprobes.txt                    |   6 +-
 include/linux/kprobes.h                      |  27 +++++
 kernel/events/callchain.c                    |   8 +-
 kernel/kprobes.c                             | 101 +++++++++++++++++-
 kernel/trace/trace.c                         |  11 +-
 .../test.d/kprobe/kretprobe_stacktrace.tc    |  25 +++++
 6 files changed, 173 insertions(+), 5 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc

diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 10f4499e677c..1965585848f4 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -597,7 +597,11 @@ address with the trampoline's address, stack backtraces and calls
 to __builtin_return_address() will typically yield the trampoline's
 address instead of the real return address for kretprobed functions.
 (As far as we can tell, __builtin_return_address() is used only
-for instrumentation and error reporting.)
+for instrumentation and error reporting.) However, since return probes
+are used extensively in tracing (where stack backtraces are useful),
+return probes will stash away the stack backtrace during function entry
+so that return probe handlers can use the entry backtrace instead of
+having a trace with just kretprobe_trampoline.
 
 If the number of times a function is called does not match the number
 of times it returns, registering a return probe on that function may
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e909413e4e38..1a1629544e56 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -40,6 +40,8 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
+#include <linux/stacktrace.h>
+#include <linux/perf_event.h>
 #include <asm/kprobes.h>
 
 #ifdef CONFIG_KPROBES
@@ -168,11 +170,18 @@ struct kretprobe {
 	raw_spinlock_t lock;
 };
 
+#define KRETPROBE_TRACE_SIZE 127
+struct kretprobe_trace {
+	int nr_entries;
+	unsigned long entries[KRETPROBE_TRACE_SIZE];
+};
+
 struct kretprobe_instance {
 	struct hlist_node hlist;
 	struct kretprobe *rp;
 	kprobe_opcode_t *ret_addr;
 	struct task_struct *task;
+	struct kretprobe_trace entry;
 	char data[0];
 };
 
@@ -371,6 +380,12 @@ void unregister_kretprobe(struct kretprobe *rp);
 int register_kretprobes(struct kretprobe **rps, int num);
 void unregister_kretprobes(struct kretprobe **rps, int num);
 
+struct kretprobe_instance *current_kretprobe_instance(void);
+void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
+				struct stack_trace *trace);
+void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
+				     struct perf_callchain_entry_ctx *ctx);
+
 void kprobe_flush_task(struct task_struct *tk);
 
 void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
@@ -397,6 +412,18 @@ static inline struct kprobe *kprobe_running(void)
 {
 	return NULL;
 }
+static inline struct kretprobe_instance *current_kretprobe_instance(void)
+{
+	return NULL;
+}
+static inline void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
+					      struct stack_trace *trace)
+{
+}
+static inline void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
+						   struct perf_callchain_entry_ctx *ctx)
+{
+}
 static inline int register_kprobe(struct kprobe *p)
 {
 	return -ENOSYS;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 24a77c34e9ad..98edcd8a6987 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -12,6 +12,7 @@
 #include <linux/perf_event.h>
 #include <linux/slab.h>
 #include <linux/sched/task_stack.h>
+#include <linux/kprobes.h>
 
 #include "internal.h"
 
@@ -197,9 +198,14 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	ctx.contexts_maxed = false;
 
 	if (kernel && !user_mode(regs)) {
+		struct kretprobe_instance *ri = current_kretprobe_instance();
+
 		if (add_mark)
 			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
-		perf_callchain_kernel(&ctx, regs);
+		if (ri)
+			kretprobe_perf_callchain_kernel(ri, &ctx);
+		else
+			perf_callchain_kernel(&ctx, regs);
 	}
 
 	if (user) {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 90e98e233647..fca3964d18cd 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1206,6 +1206,16 @@ __releases(hlist_lock)
 }
 NOKPROBE_SYMBOL(kretprobe_table_unlock);
 
+static bool kretprobe_hash_is_locked(struct task_struct *tsk)
+{
+	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+	raw_spinlock_t *hlist_lock;
+
+	hlist_lock = kretprobe_table_lock_ptr(hash);
+	return raw_spin_is_locked(hlist_lock);
+}
+NOKPROBE_SYMBOL(kretprobe_hash_is_locked);
+
 /*
  * This function is called from finish_task_switch when task tk becomes dead,
  * so that we can recycle any function-return probe instances associated
@@ -1800,6 +1810,13 @@ unsigned long __weak arch_deref_entry_point(void *entry)
 	return (unsigned long)entry;
 }
 
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs);
+
+static inline bool kprobe_is_retprobe(struct kprobe *kp)
+{
+	return kp->pre_handler == pre_handler_kretprobe;
+}
+
 #ifdef CONFIG_KRETPROBES
 /*
  * This kprobe pre_handler is registered with every kretprobe. When probe
@@ -1826,6 +1843,8 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 	hash = hash_ptr(current, KPROBE_HASH_BITS);
 	raw_spin_lock_irqsave(&rp->lock, flags);
 	if (!hlist_empty(&rp->free_instances)) {
+		struct stack_trace trace = {};
+
 		ri = hlist_entry(rp->free_instances.first,
 				struct kretprobe_instance, hlist);
 		hlist_del(&ri->hlist);
@@ -1834,6 +1853,11 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 		ri->rp = rp;
 		ri->task = current;
 
+		trace.entries = &ri->entry.entries[0];
+		trace.max_entries = KRETPROBE_TRACE_SIZE;
+		save_stack_trace_regs(regs, &trace);
+		ri->entry.nr_entries = trace.nr_entries;
+
 		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
 			raw_spin_lock_irqsave(&rp->lock, flags);
 			hlist_add_head(&ri->hlist, &rp->free_instances);
@@ -1856,6 +1880,65 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
 
+/*
+ * Return the kretprobe_instance associated with the current_kprobe. Calling
+ * this is only reasonable from within a kretprobe handler context (otherwise
+ * return NULL).
+ *
+ * Must be called within a kretprobe_hash_lock(current, ...) context.
+ */
+struct kretprobe_instance *current_kretprobe_instance(void)
+{
+	struct kprobe *kp;
+	struct kretprobe *rp;
+	struct kretprobe_instance *ri;
+	struct hlist_head *head;
+	unsigned long hash = hash_ptr(current, KPROBE_HASH_BITS);
+
+	kp = kprobe_running();
+	if (!kp || !kprobe_is_retprobe(kp))
+		return NULL;
+	if (WARN_ON(!kretprobe_hash_is_locked(current)))
+		return NULL;
+
+	rp = container_of(kp, struct kretprobe, kp);
+	head = &kretprobe_inst_table[hash];
+
+	hlist_for_each_entry(ri, head, hlist) {
+		if (ri->task == current && ri->rp == rp)
+			return ri;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(current_kretprobe_instance);
+NOKPROBE_SYMBOL(current_kretprobe_instance);
+
+void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
+				struct stack_trace *trace)
+{
+	int i;
+	struct kretprobe_trace *krt = &ri->entry;
+
+	for (i = trace->skip; i < krt->nr_entries; i++) {
+		if (trace->nr_entries >= trace->max_entries)
+			break;
+		trace->entries[trace->nr_entries++] = krt->entries[i];
+	}
+}
+
+void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
+				     struct perf_callchain_entry_ctx *ctx)
+{
+	int i;
+	struct kretprobe_trace *krt = &ri->entry;
+
+	for (i = 0; i < krt->nr_entries; i++) {
+		if (krt->entries[i] == ULONG_MAX)
+			break;
+		perf_callchain_store(ctx, (u64) krt->entries[i]);
+	}
+}
+
 bool __weak arch_kprobe_on_func_entry(unsigned long offset)
 {
 	return !offset;
@@ -2005,6 +2088,22 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
 
+struct kretprobe_instance *current_kretprobe_instance(void)
+{
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(current_kretprobe_instance);
+NOKPROBE_SYMBOL(current_kretprobe_instance);
+
+void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
+				struct stack_trace *trace)
+{
+}
+
+void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
+				     struct perf_callchain_entry_ctx *ctx)
+{
+}
 #endif /* CONFIG_KRETPROBES */
 
 /* Set the kprobe gone and remove its instruction buffer. */
@@ -2241,7 +2340,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p,
 	char *kprobe_type;
 	void *addr = p->addr;
 
-	if (p->pre_handler == pre_handler_kretprobe)
+	if (kprobe_is_retprobe(p))
 		kprobe_type = "r";
 	else
 		kprobe_type = "k";
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bf6f1d70484d..2210d38a4dbf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -42,6 +42,7 @@
 #include <linux/nmi.h>
 #include <linux/fs.h>
 #include <linux/trace.h>
+#include <linux/kprobes.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/rt.h>
 
@@ -2590,6 +2591,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
+	struct kretprobe_instance *ri = current_kretprobe_instance();
 	int use_stack;
 	int size = FTRACE_STACK_ENTRIES;
 
@@ -2626,7 +2628,9 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 		trace.entries		= this_cpu_ptr(ftrace_stack.calls);
 		trace.max_entries	= FTRACE_STACK_MAX_ENTRIES;
 
-		if (regs)
+		if (ri)
+			kretprobe_save_stack_trace(ri, &trace);
+		else if (regs)
 			save_stack_trace_regs(regs, &trace);
 		else
 			save_stack_trace(&trace);
@@ -2653,7 +2657,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	} else {
 		trace.max_entries	= FTRACE_STACK_ENTRIES;
 		trace.entries		= entry->caller;
-		if (regs)
+
+		if (ri)
+			kretprobe_save_stack_trace(ri, &trace);
+		else if (regs)
 			save_stack_trace_regs(regs, &trace);
 		else
 			save_stack_trace(&trace);
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
new file mode 100644
index 000000000000..03146c6a1a3c
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
@@ -0,0 +1,25 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+# description: Kretprobe dynamic event with a stacktrace
+
+[ -f kprobe_events ] || exit_unsupported # this is configurable
+
+echo 0 > events/enable
+echo 1 > options/stacktrace
+
+echo 'r:teststackprobe sched_fork $retval' > kprobe_events
+grep teststackprobe kprobe_events
+test -d events/kprobes/teststackprobe
+
+clear_trace
+echo 1 > events/kprobes/teststackprobe/enable
+( echo "forked")
+echo 0 > events/kprobes/teststackprobe/enable
+
+# Make sure we don't see kretprobe_trampoline and we see _do_fork.
+! grep 'kretprobe' trace
+grep '_do_fork' trace
+
+echo '-:teststackprobe' >> kprobe_events
+clear_trace
+test -d events/kprobes/teststackprobe && exit_fail || exit_pass
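For reference, the consumer this series is aimed at can be as simple as the module sketched below: a plain kretprobe whose return handler logs the return value, while the ftrace stacktrace option (as exercised by the selftest above) records the stashed entry-time stack instead of a lone kretprobe_trampoline frame. The probed symbol and printed message are illustrative only; the API calls are the existing kretprobe ones.

/* Illustrative kretprobe module (not part of the patch series). */
#include <linux/module.h>
#include <linux/kprobes.h>

static int retprobe_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* regs_return_value() gives the probed function's return value. */
	pr_info("_do_fork returned %ld\n", regs_return_value(regs));
	return 0;
}

static struct kretprobe my_kretprobe = {
	.kp.symbol_name	= "_do_fork",		/* example target only */
	.handler	= retprobe_handler,
	.maxactive	= 20,			/* concurrent instances to allocate */
};

static int __init retprobe_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit retprobe_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(retprobe_init);
module_exit(retprobe_exit);
MODULE_LICENSE("GPL");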
Hi Aleksa,
On Thu, 1 Nov 2018 19:35:50 +1100 Aleksa Sarai cyphar@cyphar.com wrote: [...]
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
Please split the test case as an independent patch.
new file mode 100644
index 000000000000..03146c6a1a3c
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
@@ -0,0 +1,25 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+# description: Kretprobe dynamic event with a stacktrace
+
+[ -f kprobe_events ] || exit_unsupported # this is configurable
+
+echo 0 > events/enable
+echo 1 > options/stacktrace
+
+echo 'r:teststackprobe sched_fork $retval' > kprobe_events
+grep teststackprobe kprobe_events
+test -d events/kprobes/teststackprobe
Hmm, what happens if we have 2 or more kretprobes on the same stack? It seems you just save the stack in the pre_handler, but that stack may already include another kretprobe's trampoline address...
Thank you,
+clear_trace
+echo 1 > events/kprobes/teststackprobe/enable
+( echo "forked")
+echo 0 > events/kprobes/teststackprobe/enable
+
+# Make sure we don't see kretprobe_trampoline and we see _do_fork.
+! grep 'kretprobe' trace
+grep '_do_fork' trace
+
+echo '-:teststackprobe' >> kprobe_events
+clear_trace
+test -d events/kprobes/teststackprobe && exit_fail || exit_pass
2.19.1
On 2018-11-02, Masami Hiramatsu mhiramat@kernel.org wrote:
Please split the test case as an independent patch.
Will do. Should the Documentation/ change also be a separate patch?
Hmm, what happens if we have 2 or more kretprobes on the same stack? It seems you just save the stack in the pre_handler, but that stack may already include another kretprobe's trampoline address...
Yeah, you're quite right...
My first instinct was to do something awful like iterate over the set of "kretprobe_instance"s with ->task == current, and then correct kretprobe_trampoline entries using ->ret_addr. (I think this would be correct because each task can only be in one context at once, and in order to get to a particular kretprobe all of your caller kretprobes were inserted before you and any sibling or later kretprobe_instances will have been removed. But I might be insanely wrong on this front.)
However (as I noted in the other thread), there is a problem where kretprobe_trampoline actually stops the unwinder in its tracks and thus you only get the first kretprobe_trampoline. This is something I'm going to look into some more (despite not having made progress on it last time) since now it's something that actually needs to be fixed (and as I mentioned in the other thread, show_stack() actually works on x86 in this context unlike the other stack_trace users).
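For illustration, a rough sketch of the fixup idea described above: walk the current task's kretprobe_instances and overwrite kretprobe_trampoline entries in an already-captured trace with the saved return addresses, newest first. This is hypothetical code, not part of the posted patch; it assumes the caller holds the kretprobe hash lock for current and passes the matching hash bucket.

/* Hypothetical sketch only; assumes the kretprobe hash lock is held. */
void kretprobe_trampoline(void);	/* arch-provided trampoline symbol */

static void fixup_kretprobe_trampolines(struct task_struct *tsk,
					struct stack_trace *trace,
					struct hlist_head *head)
{
	struct kretprobe_instance *ri;
	int i = 0;

	/* Instances are added with hlist_add_head(), so the newest come first. */
	hlist_for_each_entry(ri, head, hlist) {
		if (ri->task != tsk)
			continue;
		/* Find the next trampoline entry in the captured trace. */
		while (i < trace->nr_entries &&
		       trace->entries[i] != (unsigned long)&kretprobe_trampoline)
			i++;
		if (i == trace->nr_entries)
			break;
		/* Replace it with the real (saved) return address. */
		trace->entries[i++] = (unsigned long)ri->ret_addr;
	}
}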
On Fri, 2 Nov 2018 08:13:43 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
On 2018-11-02, Masami Hiramatsu mhiramat@kernel.org wrote:
Please split the test case as an independent patch.
Will do. Should the Documentation/ change also be a separate patch?
I think the Documentation change can be coupled with the code change if the change is small. But selftests are different; they can be backported soon for testing stable kernels.
Hmm, what happens if we have 2 or more kretprobes on the same stack? It seems you just save the stack in the pre_handler, but that stack may already include another kretprobe's trampoline address...
Yeah, you're quite right...
My first instinct was to do something awful like iterate over the set of "kretprobe_instance"s with ->task == current, and then correct kretprobe_trampoline entries using ->ret_addr. (I think this would be correct because each task can only be in one context at once, and in order to get to a particular kretprobe all of your caller kretprobes were inserted before you and any sibling or later kretprobe_instances will have been removed. But I might be insanely wrong on this front.)
yeah, you are right.
However (as I noted in the other thread), there is a problem where kretprobe_trampoline actually stops the unwinder in its tracks and thus you only get the first kretprobe_trampoline. This is something I'm going to look into some more (despite not having made progress on it last time) since now it's something that actually needs to be fixed (and as I mentioned in the other thread, show_stack() actually works on x86 in this context unlike the other stack_trace users).
I should read the unwinder code, but anyway, fixing it up in kretprobe handler context is not hard. Since each instance is on an hlist, when we hit kretprobe_trampoline we can search for it. However, the problem is the case outside of the kretprobe handler context. In that context we need to try to lock the hlist and search the list, which will be a costly operation.
On the other hand, the func-graph tracer introduces a shadow return-address stack for each task (thread), and when we hit its trampoline on the stack, we can easily look up the entry from the "current" task without locking the shadow stack (and we already do this). This pays a cost (one page) for each task, but it is smarter than kretprobe, which uses a system-wide hash table and has to search an hlist in which return addresses from other contexts coexist.
Thank you,
On 2018-11-02, Masami Hiramatsu mhiramat@kernel.org wrote:
[...]
I should read the unwinder code, but anyway, fixing it up in kretprobe handler context is not hard. Since each instance is on an hlist, when we hit kretprobe_trampoline we can search for it.
As in, find the stored stack and merge the two? Interesting idea, though Steven has shot this down because of the associated cost (I was considering making it a kprobe flag, but that felt far too dirty).
However, the problem is the case outside of the kretprobe handler context. In that context we need to try to lock the hlist and search the list, which will be a costly operation.
I think the best solution would be to unify all of the kretprobe-like things so that we don't need to handle non-kretprobe contexts for what is basically the same underlying idea, if we wanted to do it this way.
I think a potentially better option would be to just fix the unwinder to correctly handle kretprobes (like it handles ftrace today).
On the other hand, the func-graph tracer introduces a shadow return-address stack for each task (thread), and when we hit its trampoline on the stack, we can easily look up the entry from the "current" task without locking the shadow stack (and we already do this). This pays a cost (one page) for each task, but it is smarter than kretprobe, which uses a system-wide hash table and has to search an hlist in which return addresses from other contexts coexist.
Probably a silly question (I've been staring at the function_graph code trying to understand how it handles return addresses -- and I'm really struggling), is this what ->ret_stack (and ->curr_ret_stack) do?
Can you explain how the .retp handling works, because I'm really missing how the core arch/ hooks know to pass the correct retp value.
On Fri, 2 Nov 2018 15:37:33 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
On 2018-11-02, Masami Hiramatsu mhiramat@kernel.org wrote:
[...]
I should read the unwinder code, but anyway, fixing it up in kretprobe handler context is not hard. Since each instance is on an hlist, when we hit kretprobe_trampoline we can search for it.
As in, find the stored stack and merge the two? Interesting idea, though Steven has shot this down because of the associated cost (I was considering making it a kprobe flag, but that felt far too dirty).
I think we don't need the flag; we just need to register a trampoline, and the trampoline must restore (just pop) the original address from the shadow stack. Also, the stack unwinder can query the restored address from the shadow stack when it hits an unfamiliar (or registered) address.
However, the problem is the case outside of the kretprobe handler context. In that context we need to try to lock the hlist and search the list, which will be a costly operation.
I think the best solution would be to unify all of the kretprobe-like things so that we don't need to handle non-kretprobe contexts for what is basically the same underlying idea, if we wanted to do it this way.
Yes; anyway, the current kretprobe instance idea is not good: it requires locks, and the user must specify "enough" instances for each kretprobe.
I think a potentially better option would be to just fix the unwinder to correctly handle kretprobes (like it handles ftrace today).
I think both can be fixed in the same way.
On the other hand, func-graph tracer introduces a shadow return address stack for each task (thread), and when we hit its trampoline on the stack, we can easily search the entry from "current" task without locking the shadow stack (and we already did it). This may need to pay a cost (1 page) for each task, but smarter than kretprobe, which makes a system-wide hash-table and need to search from hlist which has return addresses of other context coexist.
Probably a silly question (I've been staring at the function_graph code trying to understand how it handles return addresses -- and I'm really struggling), is this what ->ret_stack (and ->curr_ret_stack) do?
Yes.
Can you explain how the .retp handling works, because I'm really missing how the core arch/ hooks know to pass the correct retp value.
Sure. The current->ret_stack can serve as a hidden stack array; it saves the original return address, the function address, and the modified address for each entry. (A kretprobe can be looked up by the function address.)
The stack unwinder checks each unwound stack entry by comparing it with current->ret_stack[i].retp, and if they are the same, it can recover the original return address from ret_stack[i].ret.
This should make things simpler and solve it in a generic way.
Thank you,
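For reference, the retp handling described above is the existing HAVE_FUNCTION_GRAPH_RET_ADDR_PTR support; from an unwinder's point of view it is consumed roughly as sketched below. ftrace_graph_ret_addr() is the real helper; in_valid_stack() and record_entry() are hypothetical stand-ins for the unwinder's own bookkeeping.

/* Simplified frame-pointer walk that undoes function-graph rewriting. */
static void walk_stack_sketch(struct pt_regs *regs)
{
	unsigned long *fp = (unsigned long *)frame_pointer(regs);
	int graph_idx = 0;

	while (in_valid_stack(fp)) {
		unsigned long *retp = fp + 1;	/* slot holding the return address */
		unsigned long addr = *retp;

		/*
		 * If *retp was rewritten to return_to_handler, look up the
		 * original address in current->ret_stack; passing retp lets
		 * the helper disambiguate recursive calls to the same function.
		 */
		addr = ftrace_graph_ret_addr(current, &graph_idx, addr, retp);

		record_entry(addr);		/* feed the recovered address out */
		fp = (unsigned long *)*fp;	/* follow the saved frame pointer */
	}
}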
On Thu, 1 Nov 2018 19:35:50 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
Historically, kretprobe has always produced unusable stack traces (kretprobe_trampoline is the only entry in most cases, because of the funky stack pointer overwriting). This has caused quite a few annoyances when using tracing to debug problems[1] -- since return values are only available with kretprobes but stack traces were only usable for kprobes, users had to probe both and then manually associate them.
With the advent of bpf_trace, users would have been able to do this association in bpf, but this was less than ideal (because bpf_get_stackid would still produce rubbish and programs that didn't know better would get silly results). The main usecase for stack traces (at least with bpf_trace) is for DTrace-style aggregation on stack traces (both entry and exit). Therefore we cannot simply correct the stack trace on exit -- we must stash away the stack trace and return the entry stack trace when it is requested.
[...]

@@ -1834,6 +1853,11 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 		ri->rp = rp;
 		ri->task = current;
 
+		trace.entries = &ri->entry.entries[0];
+		trace.max_entries = KRETPROBE_TRACE_SIZE;
+		save_stack_trace_regs(regs, &trace);
+		ri->entry.nr_entries = trace.nr_entries;
+
So basically you're saving a copy of the stack trace for each probe, for something that may not ever be used?
This adds quite a lot of overhead for something not used often.
I think the real answer is to fix kretprobes (and I just checked, a stack trace taken from the function graph tracer's return callback doesn't go past the return_to_handler function either; it's just that a stack trace was never done on the return side).
The more I look at this patch, the less I like it.
-- Steve
On 2018-11-01, Steven Rostedt rostedt@goodmis.org wrote:
On Thu, 1 Nov 2018 19:35:50 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
@@ -1834,6 +1853,11 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 		ri->rp = rp;
 		ri->task = current;
 
+		trace.entries = &ri->entry.entries[0];
+		trace.max_entries = KRETPROBE_TRACE_SIZE;
+		save_stack_trace_regs(regs, &trace);
+		ri->entry.nr_entries = trace.nr_entries;
+
So basically you're saving a copy of the stack trace for each probe, for something that may not ever be used?
This adds quite a lot of overhead for something not used often.
I think the real answer is to fix kretprobes (and I just checked, a stack trace taken from the function graph tracer's return callback doesn't go past the return_to_handler function either; it's just that a stack trace was never done on the return side).
The more I look at this patch, the less I like it.
That's more than fair.
There's also an issue that Masami noted -- nested kretprobes don't give you the full stack trace with this solution since the stack was already overwritten. I think that the only real option is to fix the unwinder to work in a kretprobe context -- which is what I'm looking at now.
Unfortunately, I'm having a lot of trouble understanding how the current ftrace hooking works -- ORC has a couple of ftrace hooks that seem reasonable on the surface but I don't understand (for instance) how HAVE_FUNCTION_GRAPH_RET_ADDR_PTR *actually* works. Though your comment appears to indicate that it doesn't work for stack traces?
For kretprobes I think it would be fairly easy to reconstruct what landed you into a kretprobe_trampoline by walking the set of kretprobe_instances (since all new ones are added to the head, you can get the real return address in-order).
But I still have to figure out what is actually stopping the save_stack_trace() unwinder that isn't stopping the show_stacks() unwinder (though the show_stacks() code is more ... liberal with the degree of certainty it has about the unwind).
Am I barking up the wrong tree?
On 2018-11-02, Aleksa Sarai cyphar@cyphar.com wrote:
For kretprobes I think it would be fairly easy to reconstruct what landed you into a kretprobe_trampoline by walking the set of kretprobe_instances (since all new ones are added to the head, you can get the real return address in-order).
But I still have to figure out what is actually stopping the save_stack_trace() unwinder that isn't stopping the show_stacks() unwinder (though the show_stacks() code is more ... liberal with the degree of certainty it has about the unwind).
As an aside, I just tested with the frame unwinder and it isn't thrown off-course by kretprobe_trampoline (though obviously the stack is still wrong). So I think we just need to hook into the ORC unwinder to get it to continue skipping up the stack, as well as add the rewriting code for the stack traces (for all unwinders I guess -- though ideally we should do this without having to add the same code to every architecture).
On Fri, 2 Nov 2018 17:59:32 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
As an aside, I just tested with the frame unwinder and it isn't thrown off-course by kretprobe_trampoline (though obviously the stack is still wrong). So I think we just need to hook into the ORC unwinder to get it to continue skipping up the stack, as well as add the rewriting code for the stack traces (for all unwinders I guess -- though ideally we should
I agree that this is the right solution.
do this without having to add the same code to every architecture).
True, and there's an art to consolidating the code between architectures.
I'm currently looking at function graph and seeing if I can consolidate it too. And I'm also trying to get multiple uses to hook into its infrastructure. I think I finally figured out a way to do so.
The reason it is difficult is that you need to maintain state between the entry of a function and the exit, for each task and callback that is registered. Hence, it's a 3-tuple (function stack, task, callbacks). And this must be maintained across preemption. A task may sleep for minutes, and the state needs to be retained.
The only state that must be retained is the function stack with the task, because if that gets out of sync, the system crashes. But the callback state can be removed.
Here's what is there now:
When something is registered with the function graph tracer, every task gets a shadowed stack. A hook is added to fork to add shadow stacks to new tasks. Once a shadow stack is added to a task, that shadow stack is never removed until the task exits.
When the function is entered, the real return address is stored in the shadow stack and the trampoline address is put in its place.
On return, the trampoline is called, and it will pop off the return address from the shadow stack and return to that.
The issue with multiple users, is that different users may want to trace different functions. On entry, the user could say it doesn't want to trace the current function, and the return part must not be called on exit. Keeping track of which user needs the return called is the tricky part.
Here's what I plan on implementing:
Along with a shadow stack, I was going to add a 4096-byte (one page) array that holds 64 8-byte masks to every task as well. This will allow 64 simultaneous users (which is rather extreme). If we need to support more, we could allocate another page for all tasks. The 8-byte mask will represent each depth (allowing this to be done for a function call stack depth of 64, which should also be enough).
Each user will be assigned one of the masks. Each bit in the mask represents the depth of the shadow stack. When a function is called, each user registered with the function graph tracer will get called (if they asked to be called for this function, via the ftrace_ops hashes) and if they want to trace the function, then the bit is set in the mask for that stack depth.
When the function exits and we pop off the return address from the shadow stack, we then look at all the bits set for the corresponding users, call their return callbacks, and ignore anything that is not set.
When a user is unregistered, the corresponding bits that represent it are cleared, and its return callback will not be called. But the tasks being traced will still have their shadow stack to allow them to get back to normal.
I'll hopefully have a prototype ready by plumbers.
And this too will require each architecture to probably change. As a side project to this, I'm going to try to consolidate the function graph code among all the architectures as well. Not an easy task.
-- Steve
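The per-depth mask scheme above can be read in more than one way; the sketch below assumes one 64-bit mask per shadow-stack depth with one bit per registered user, which matches the "bit is set in the mask for that stack depth" wording. None of these names exist in the kernel; this is purely illustrative.

/* Purely illustrative sketch of the proposed per-depth user masks. */
#define FGRAPH_MAX_USERS	64

struct fgraph_user {
	int idx;					/* bit assigned to this user */
	void (*retfunc)(unsigned long func, unsigned long ret);
};

/* On function entry: remember which users want the return callback. */
static void fgraph_entry(u64 *depth_masks, int depth,
			 struct fgraph_user **users, int nr_users,
			 unsigned long func)
{
	int i;

	depth_masks[depth] = 0;
	for (i = 0; i < nr_users; i++) {
		if (user_wants_function(users[i], func))	/* hypothetical */
			depth_masks[depth] |= 1ULL << users[i]->idx;
	}
}

/* On return (from the trampoline): call back only the users that asked. */
static void fgraph_return(u64 *depth_masks, int depth,
			  struct fgraph_user **users, int nr_users,
			  unsigned long func, unsigned long ret)
{
	u64 mask = depth_masks[depth];
	int i;

	for (i = 0; i < nr_users; i++) {
		if (mask & (1ULL << users[i]->idx))
			users[i]->retfunc(func, ret);
	}
}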
On Fri, Nov 02, 2018 at 09:16:58AM -0400, Steven Rostedt wrote:
On Fri, 2 Nov 2018 17:59:32 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
As an aside, I just tested with the frame unwinder and it isn't thrown off-course by kretprobe_trampoline (though obviously the stack is still wrong). So I think we just need to hook into the ORC unwinder to get it to continue skipping up the stack, as well as add the rewriting code for the stack traces (for all unwinders I guess -- though ideally we should
I agree that this is the right solution.
Sounds good to me.
However, it would be *really* nice if function graph and kretprobes shared the same infrastructure, like they do for function entry. There's a lot of duplicated effort there.
do this without having to add the same code to every architecture).
True, and there's an art to consolidating the code between architectures.
I'm currently looking at function graph and seeing if I can consolidate it too. And I'm also trying to get multiple uses to hook into its infrastructure. I think I finally figured out a way to do so.
[...]
I'll hopefully have a prototype ready by plumbers.
Why do we need multiple users? It would be a lot simpler if we could just enforce a single user per fgraphed/kretprobed function (and return -EBUSY if it's already being traced/probed).
And this too will require each architecture to probably change. As a side project to this, I'm going to try to consolidate the function graph code among all the architectures as well. Not an easy task.
Do you mean implementing HAVE_FUNCTION_GRAPH_RET_ADDR_PTR for all the arches? If so, I think I have an old crusty patch which attempted to do that. I could try to dig it up if you're interested.
On Fri, 2 Nov 2018 10:43:26 -0500 Josh Poimboeuf jpoimboe@redhat.com wrote:
I'll hopefully have a prototype ready by plumbers.
Why do we need multiple users? It would be a lot simpler if we could just enforce a single user per fgraphed/kretprobed function (and return -EBUSY if it's already being traced/probed).
Because that means if function graph tracer is active, then you can't do a kretprobe, and vice versa. I'd really like to have it working for multiple users, then we could trace different graph functions and store them in different buffers. It would also allow for perf to use function graph tracer too.
And this too will require each architecture to probably change. As a side project to this, I'm going to try to consolidate the function graph code among all the architectures as well. Not an easy task.
Do you mean implementing HAVE_FUNCTION_GRAPH_RET_ADDR_PTR for all the arches? If so, I think I have an old crusty patch which attempted to do that. I could try to dig it up if you're interested.
I'd like to have that, but it still requires some work. But I'd like just the truly architecture-dependent code to be in the architecture (basically the asm code), and have the ability to move most of the duplicate code out of the archs.
-- Steve
On Fri, 2 Nov 2018 12:13:07 -0400 Steven Rostedt rostedt@goodmis.org wrote:
On Fri, 2 Nov 2018 10:43:26 -0500 Josh Poimboeuf jpoimboe@redhat.com wrote:
I'll hopefully have a prototype ready by plumbers.
Why do we need multiple users? It would be a lot simpler if we could just enforce a single user per fgraphed/kretprobed function (and return -EBUSY if it's already being traced/probed).
Because that means if function graph tracer is active, then you can't do a kretprobe, and vice versa. I'd really like to have it working for multiple users, then we could trace different graph functions and store them in different buffers. It would also allow for perf to use function graph tracer too.
Steve, how would you allow multiple users on it? Something like this?
ret_trampoline_multiple() {
	list_for_each(handler, &shadow_entry[i].handlers, list)
		handler(shadow_entry[i]);
	restore_retval_and_jump_to(shadow_entry[i].orig);
}
And this too will require each architecture to probably change. As a side project to this, I'm going to try to consolidate the function graph code among all the architectures as well. Not an easy task.
Do you mean implementing HAVE_FUNCTION_GRAPH_RET_ADDR_PTR for all the arches? If so, I think have an old crusty patch which attempted to that. I could try to dig it up if you're interested.
I'd like to have that, but it still requires some work. But I'd like just the truly architecture-dependent code to be in the architecture (basically the asm code), and have the ability to move most of the duplicate code out of the archs.
I will also do that for kretprobe handlers.
Thank you,
On Sat, 3 Nov 2018 22:00:12 +0900 Masami Hiramatsu mhiramat@kernel.org wrote:
On Fri, 2 Nov 2018 12:13:07 -0400 Steven Rostedt rostedt@goodmis.org wrote:
Because that means if function graph tracer is active, then you can't do a kretprobe, and vice versa. I'd really like to have it working for multiple users, then we could trace different graph functions and store them in different buffers. It would also allow for perf to use function graph tracer too.
Steve, how would you allow multiple users on it? Something like this?
ret_trampoline_multiple() {
	list_for_each(handler, &shadow_entry[i].handlers, list)
		handler(shadow_entry[i]);
	restore_retval_and_jump_to(shadow_entry[i].orig);
}
Something like that. But since it's not a single mapping between shadow entries and handlers (we have multiple tasks with multiple shadow entries and multiple handlers), we can't use a linked list (two different parents per handler).
I was thinking of a bitmask that represents the handlers, and use that to map which handler gets called for which shadow entry for a particular task.
-- Steve
On Sat, 3 Nov 2018 09:13:41 -0400 Steven Rostedt rostedt@goodmis.org wrote:
On Sat, 3 Nov 2018 22:00:12 +0900 Masami Hiramatsu mhiramat@kernel.org wrote:
On Fri, 2 Nov 2018 12:13:07 -0400 Steven Rostedt rostedt@goodmis.org wrote:
Because that means if function graph tracer is active, then you can't do a kretprobe, and vice versa. I'd really like to have it working for multiple users, then we could trace different graph functions and store them in different buffers. It would also allow for perf to use function graph tracer too.
Steve, how would you allow multiple users on it? Something like this?
ret_trampoline_multiple() {
	list_for_each(handler, &shadow_entry[i].handlers, list)
		handler(shadow_entry[i]);
	restore_retval_and_jump_to(shadow_entry[i].orig);
}
Something like that. But since it's not a single mapping between shadow entries and handlers (we have multiple tasks with multiple shadow entries and multiple handlers), we can't use a linked list (two different parents per handler).
Yes, I understand it.
I was thinking of a bitmask that represents the handlers, and use that to map which handler gets called for which shadow entry for a particular task.
Hmm, I'm afraid that is too complicated and not scalable. I would rather like to see the open shadow entry...
entry: [[original_retaddr][function][modified_retaddr]]
So if there are many users on same function, the entries will be like this
[[original_return_address][function][trampoline_A]]
[[trampoline_A][function][trampoline_B]]
[[trampoline_B][function][trampoline_C]]
And on the top of the stack, there is trampoline_C instead of original_return_address. In this case, on return to trampoline_C(), it jumps back to trampoline_B() and then to trampoline_A(). Eventually it jumps back to original_return_address.
This way, we don't need to allocate another bitmap/pages for the shadow stack. We only need a shadow stack for each task. Also, the unwinder can easily find trampoline_C from the shadow stack and restore original_return_address. (Of course trampoline_A, B and C must be registered so that the search function can skip them.)
Thank you,
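To make the chained-trampoline idea above concrete, here is a minimal userspace model of it. Everything here (shadow_entry, push_return_hook, unwind_ret_addr, the fake addresses) is illustrative only and is not kernel API; it just shows how an unwinder could follow the chain from trampoline_C back to original_return_address.

/* Userspace model of the chained-trampoline shadow stack sketched above.
 * All names here are illustrative, not kernel API. */
#include <stdio.h>

struct shadow_entry {
	unsigned long orig_ret;		/* [original_retaddr] */
	unsigned long func;		/* [function] */
	unsigned long modified_ret;	/* [modified_retaddr] (a trampoline) */
};

static struct shadow_entry shadow_stack[64];
static int shadow_top;

/* Each user's entry hook records what was in the real return slot (which
 * may already be another user's trampoline) and replaces it with its own
 * trampoline address. */
static void push_return_hook(unsigned long func, unsigned long *ret_slot,
			     unsigned long trampoline)
{
	struct shadow_entry *e = &shadow_stack[shadow_top++];

	e->orig_ret = *ret_slot;
	e->func = func;
	e->modified_ret = trampoline;
	*ret_slot = trampoline;
}

/* What an unwinder would do: given the value found in a return-address
 * slot, keep following the chain until it is no longer a trampoline. */
static unsigned long unwind_ret_addr(unsigned long ret)
{
	int i;

	for (i = shadow_top - 1; i >= 0; i--)
		if (shadow_stack[i].modified_ret == ret)
			ret = shadow_stack[i].orig_ret;
	return ret;
}

int main(void)
{
	unsigned long ret_slot = 0x1000;	/* original_return_address */

	push_return_hook(0x2000, &ret_slot, 0xa);	/* trampoline_A */
	push_return_hook(0x2000, &ret_slot, 0xb);	/* trampoline_B */
	push_return_hook(0x2000, &ret_slot, 0xc);	/* trampoline_C */

	/* the real stack slot now holds trampoline_C */
	printf("on stack: %#lx, unwound: %#lx\n",
	       ret_slot, unwind_ret_addr(ret_slot));
	return 0;
}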
On Sun, 4 Nov 2018 01:34:30 +0900 Masami Hiramatsu mhiramat@kernel.org wrote:
I was thinking of a bitmask that represents the handlers, and use that to map which handler gets called for which shadow entry for a particular task.
Hmm, I'm afraid that is too complicated and not scalable. I would rather like to see the open shadow entry...
It can scale and it's not too complex (I already played with it a little). But that said, I'm not committed to it, and using the shadow stack is also an interesting idea.
entry: [[original_retaddr][function][modified_retaddr]]
So if there are many users on same function, the entries will be like this
[[original_return_address][function][trampoline_A]]
[[trampoline_A][function][trampoline_B]]
[[trampoline_B][function][trampoline_C]]
And on the top of the stack, there is trampoline_C instead of original_return_address. In this case, on return to trampoline_C(), it jumps back to trampoline_B() and then to trampoline_A(). Eventually it jumps back to original_return_address.
Where are trampolines A, B, and C made? Do we also need to dynamically create them? If I register multiple function tracing ones, each one will need its own trampoline?
This way, we don't need to allocate another bitmap/pages for the shadow stack. We only need a shadow stack for each task. Also, the unwinder can easily find trampoline_C from the shadow stack and restore original_return_address. (Of course trampoline_A, B and C must be registered so that the search function can skip them.)
What I was thinking was to store a count and the functions to be called:
[original_return_address] [function_A] [function_B] [function_C] [ 3 ]
Then the trampoline that processes the return codes for ftrace (and kretprobes and everyone else) can simply do:
	count = pop_shadow_stack();
	for (i = 0; i < count; i++) {
		func = pop_shadow_stack();
		func(...);
	}
	return_address = pop_shadow_stack();
That way we only need to register a function to the return handler and it will be called, without worrying about making trampolines. There will just be a single trampoline that handles all the work.
-- Steve
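A minimal userspace model of this "count plus functions" layout follows (the push/pop helpers, the handler type and the on_entry/return_trampoline names are made up for illustration; the real thing would live on a per-task shadow stack):

/* Userspace model of the "count + functions" shadow stack layout above;
 * names and types are illustrative only. */
#include <stdio.h>

typedef void (*ret_handler_t)(void);

static unsigned long shadow[128];
static int top;	/* next free slot; this model's stack grows upwards */

static void push(unsigned long v) { shadow[top++] = v; }
static unsigned long pop(void)    { return shadow[--top]; }

/* Function entry: stash the real return address, then every registered
 * handler, then how many handlers there are. */
static void on_entry(unsigned long orig_ret, ret_handler_t *handlers, long nr)
{
	long i;

	push(orig_ret);
	for (i = 0; i < nr; i++)
		push((unsigned long)handlers[i]);
	push((unsigned long)nr);
}

/* The single return trampoline: pop the count, call each handler (in
 * reverse push order here), then hand back the original return address. */
static unsigned long return_trampoline(void)
{
	long i, count = (long)pop();

	for (i = 0; i < count; i++)
		((ret_handler_t)pop())();
	return pop();	/* original return address */
}

static void handler_a(void) { puts("handler A"); }
static void handler_b(void) { puts("handler B"); }

int main(void)
{
	ret_handler_t hs[] = { handler_a, handler_b };

	on_entry(0x1000, hs, 2);
	printf("return to %#lx\n", return_trampoline());
	return 0;
}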
On Sat, 3 Nov 2018 13:30:21 -0400 Steven Rostedt rostedt@goodmis.org wrote:
What I was thinking was to store a count and the functions to be called:
[original_return_address] [function_A] [function_B] [function_C] [ 3 ]
Then the trampoline that processes the return codes for ftrace (and kretprobes and everyone else) can simply do:
	count = pop_shadow_stack();
	for (i = 0; i < count; i++) {
		func = pop_shadow_stack();
		func(...);
	}
	return_address = pop_shadow_stack();
That way we only need to register a function to the return handler and it will be called, without worrying about making trampolines. There will just be a single trampoline that handles all the work.
And since the most common case is a single function to call, instead of using a count we can take advantage of the fact that kernel function addresses are negative numbers and do:
[original_return_address] [function_A]
----
long count;
	count = pop_shadow_stack();
	if (count < 0) {
		func = (void *)count;
		func();
	} else {
		for (i = 0; i < count; i++) {
			[...]
The unwinder will just need to know how to handle all this :-)
-- Steve
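Roughly how the entry side might pair up with that, as a userspace model. Only the negative-count check on the return side comes from the snippet above; the entry-side rule ("one user stores just its handler, more users fall back to the counted layout") is an assumption. Kernel text addresses are modelled as constants with the top bit set so they read as negative signed longs.

/* Userspace model of the "single handler needs no count" trick. */
#include <stdio.h>
#include <stdint.h>

static uint64_t shadow[128];
static int top;

static void push(uint64_t v) { shadow[top++] = v; }
static uint64_t pop(void)    { return shadow[--top]; }

#define FUNC_A 0xffffffff81000010ULL	/* fake "kernel" handler addresses */
#define FUNC_B 0xffffffff81000020ULL

static void call_handler(uint64_t func)
{
	printf("would call handler at %#llx\n", (unsigned long long)func);
}

/* Entry side (an assumption, not spelled out above): one user stores just
 * its handler, more than one falls back to the counted layout. */
static void on_entry(uint64_t orig_ret, const uint64_t *funcs, long nr)
{
	long i;

	push(orig_ret);
	if (nr == 1) {
		push(funcs[0]);
	} else {
		for (i = 0; i < nr; i++)
			push(funcs[i]);
		push((uint64_t)nr);
	}
}

/* Return side, following Steve's snippet above. */
static uint64_t return_trampoline(void)
{
	int64_t i, count = (int64_t)pop();

	if (count < 0) {
		call_handler((uint64_t)count);
	} else {
		for (i = 0; i < count; i++)
			call_handler(pop());
	}
	return pop();
}

int main(void)
{
	uint64_t one[] = { FUNC_A }, two[] = { FUNC_A, FUNC_B };

	on_entry(0x1000, one, 1);
	printf("single user: return to %#llx\n",
	       (unsigned long long)return_trampoline());

	on_entry(0x2000, two, 2);
	printf("two users:   return to %#llx\n",
	       (unsigned long long)return_trampoline());
	return 0;
}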
On Sat, 3 Nov 2018 13:30:21 -0400 Steven Rostedt rostedt@goodmis.org wrote:
On Sun, 4 Nov 2018 01:34:30 +0900 Masami Hiramatsu mhiramat@kernel.org wrote:
I was thinking of a bitmask that represents the handlers, and use that to map which handler gets called for which shadow entry for a particular task.
Hmm, I'm afraid that is too complicated and not scalable. I would rather like to see the open shadow entry...
It can scale and it's not too complex (I already played with it a little). But that said, I'm not committed to it, and using the shadow stack is also an interesting idea.
entry: [[original_retaddr][function][modified_retaddr]]
So if there are many users on same function, the entries will be like this
[[original_return_address][function][trampoline_A]]
[[trampoline_A][function][trampoline_B]]
[[trampoline_B][function][trampoline_C]]
And on the top of the stack, there is trampoline_C instead of original_return_address. In this case, on return to trampoline_C(), it jumps back to trampoline_B() and then to trampoline_A(). Eventually it jumps back to original_return_address.
Where are trampolines A, B, and C made? Do we also need to dynamically create them? If I register multiple function tracing ones, each one will need its own trampoline?
No, I think trampolines are very limited. Currently we will only have the ftrace and kretprobe trampolines.
This way, we don't need to allocate another bitmap/pages for the shadow stack. We only need a shadow stack for each task. Also, the unwinder can easily find trampoline_C from the shadow stack and restore original_return_address. (Of course trampoline_A, B and C must be registered so that the search function can skip them.)
What I was thinking was to store a count and the functions to be called:
[original_return_address] [function_A] [function_B] [function_C] [ 3 ]
Then the trampoline that processes the return codes for ftrace (and kretprobes and everyone else) can simply do:
	count = pop_shadow_stack();
	for (i = 0; i < count; i++) {
		func = pop_shadow_stack();
		func(...);
	}
	return_address = pop_shadow_stack();
Ah, that's a good idea. I think we also have to store the called function's entry address along with the count, but basically I agree with you.
If we have space to store some data along with the function address, that would also be good for kretprobes. systemtap heavily uses "entry data" for saving data at function entry for use by the exit handler.
That way we only need to register a function to the return handler and it will be called, without worrying about making trampolines. There will just be a single trampoline that handles all the work.
OK, and could you make it independent from the function graph tracer, so that a kernel with CONFIG_KPROBES=y but CONFIG_FUNCTION_GRAPH_TRACER=n can support kretprobes too?
Thank you,
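A tiny userspace model of that "entry data" idea: each shadow-stack entry carries a blob saved at function entry (a timestamp here) that the exit handler can use. Layout and names are illustrative only.

/* Userspace model of per-entry "entry data" carried on the shadow stack
 * for the exit handler. */
#include <stdio.h>
#include <time.h>

struct shadow_entry {
	unsigned long orig_ret;
	unsigned long func;
	clock_t entry_data;	/* e.g. a timestamp taken at entry */
};

static struct shadow_entry shadow[64];
static int top;

static void on_entry(unsigned long orig_ret, unsigned long func)
{
	struct shadow_entry *e = &shadow[top++];

	e->orig_ret = orig_ret;
	e->func = func;
	e->entry_data = clock();	/* saved for the exit handler */
}

static unsigned long on_return(void)
{
	struct shadow_entry *e = &shadow[--top];

	printf("func %#lx took %ld ticks\n", e->func,
	       (long)(clock() - e->entry_data));
	return e->orig_ret;
}

int main(void)
{
	on_entry(0x1000, 0x2000);
	printf("return to %#lx\n", on_return());
	return 0;
}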
On 2018-11-02, Steven Rostedt rostedt@goodmis.org wrote:
As an aside, I just tested with the frame unwinder and it isn't thrown off-course by kretprobe_trampoline (though obviously the stack is still wrong). So I think we just need to hook into the ORC unwinder to get it to continue skipping up the stack, as well as add the rewriting code for the stack traces (for all unwinders I guess -- though ideally we should
I agree that this is the right solution.
do this without having to add the same code to every architecture).
True, and there's an art to consolidating the code between architectures.
I'm currently looking at function graph and seeing if I can consolidate it too. And I'm also trying to get multiple users to hook into its infrastructure. I think I finally figured out a way to do so.
The reason it is difficult is that you need to maintain state between the entry of a function and the exit for each task and callback that is registered. Hence, it's a 3-tuple (function stack, task, callbacks). And this must be maintained across preemption. A task may sleep for minutes, and the state needs to be retained.
The only state that must be retained is the function stack with the task, because if that gets out of sync, the system crashes. But the callback state can be removed.
Here's what is there now:
When something is registered with the function graph tracer, every task gets a shadowed stack. A hook is added to fork to add shadow stacks to new tasks. Once a shadow stack is added to a task, that shadow stack is never removed until the task exits.
When the function is entered, the real return code is stored in the shadow stack and the trampoline address is put in its place.
On return, the trampoline is called, and it will pop off the return code from the shadow stack and return to that.
I was playing with a toy version of this using the existing kretprobe architecture (by borrowing the shadow stack concept of storing the stack_addr() -- though it's actually stored inside the existing kretprobe_instance and thus doesn't need separate push-pop code).
The rewriting of kretprobe_trampoline in the stack unwinder *works* on x86 now except for the regs handling (the current function is still incorrectly shown as kretprobe_trampoline). This is actually a general bug in ftrace as well, because (for instance) while the unwinder calls into ftrace_graph_ret_addr() while walking up the stack this isn't used to correct regs->ip in perf_callchain_kernel(). I think this is the cause of the bug in ftrace-graph (if you switch to the "frame" unwinder you might be able to see it more clearly) -- but when trying to fix it I'm having trouble figuring out what retp should be (stack_addr(regs) doesn't give the right result for some reason).
I will try to fix it up and attach it, but if you're already working on a prototype that unifies all the users, that works too. The patch I have at the moment duplicates each of the key ftrace_graph_ret_addr() invocations with a matching kretprobe_ret_addr() (though there are only three of these).
On 2018-11-03, Aleksa Sarai cyphar@cyphar.com wrote:
This is actually a general bug in ftrace as well, because (for instance) while the unwinder calls into ftrace_graph_ret_addr() while walking up the stack this isn't used to correct regs->ip in perf_callchain_kernel(). I think this is the cause of the bug in ftrace-graph (if you switch to the "frame" unwinder you might be able to see it more clearly) -- but when trying to fix it I'm having trouble figuring out what retp should be (stack_addr(regs) doesn't give the right result for some reason).
@Steven, @Masami: Can you verify whether this is an actual bug? The snippet is like (arch/x86/events/core.c):
void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
			   struct pt_regs *regs)
{
	/* ... */
	if (perf_callchain_store(entry, regs->ip))
		return;
	/* rest of unwind code */
}
And it seems to me that there's an ftrace_graph_ret_addr call missing here (though I was having trouble figuring out what the right retp would be in this context -- both stack_addr(regs) and &regs->sp appear to be wrong when I was trying to get this to work with kretprobes -- while my patch worked with all other kretprobe_trampoline cases in the unwinder).
The same issue is present in __save_stack_trace (arch/x86/kernel/stacktrace.c). This is likely the only reason that -- as Steven said -- stacktraces wouldn't work with ftrace-graph (and thus with the refactor both of you are discussing).
Thanks.
On Sun, 4 Nov 2018 22:59:13 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
The same issue is present in __save_stack_trace (arch/x86/kernel/stacktrace.c). This is likely the only reason that -- as Steven said -- stacktraces wouldn't work with ftrace-graph (and thus with the refactor both of you are discussing).
By the way, I was playing with the ORC unwinder and stack traces from the function graph tracer return code, and got it working with the patch below. Caution: that patch also has a stack trace hardcoded in the return path of the function graph tracer, so you don't want to run function graph tracing without filtering.
You can apply the patch and do:
 # cd /sys/kernel/debug/tracing
 # echo schedule > set_ftrace_filter
 # echo function_graph > current_tracer
 # cat trace
 3)               |  schedule() {
    rcu_preempt-10    [003] ....    91.160297: <stack trace>
 => ftrace_return_to_handler
 => return_to_handler
 => schedule_timeout
 => rcu_gp_kthread
 => kthread
 => ret_from_fork
 3) # 4009.085 us  |  }
 3)               |  schedule() {
    kworker/1:0-17    [001] ....    91.163288: <stack trace>
 => ftrace_return_to_handler
 => return_to_handler
 => worker_thread
 => kthread
 => ret_from_fork
 1) # 7000.070 us  |  }
 1)               |  schedule() {
    rcu_preempt-10    [003] ....    91.164311: <stack trace>
 => ftrace_return_to_handler
 => return_to_handler
 => schedule_timeout
 => rcu_gp_kthread
 => kthread
 => ret_from_fork
 3) # 4006.540 us  |  }
Whereas just adding the stack trace without the other code, these traces ended at "return_to_handler".
This patch is not for inclusion, it was just a test to see how to make this work.
-- Steve
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 91b2cff4b79a..4bcd646ae1f4 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -320,13 +320,15 @@ ENTRY(ftrace_graph_caller)
 ENDPROC(ftrace_graph_caller)
 
 ENTRY(return_to_handler)
-	UNWIND_HINT_EMPTY
-	subq $24, %rsp
+	subq $8, %rsp
+	UNWIND_HINT_FUNC
+	subq $16, %rsp
 
 	/* Save the return values */
 	movq %rax, (%rsp)
 	movq %rdx, 8(%rsp)
 	movq %rbp, %rdi
+	leaq 16(%rsp), %rsi
 
 	call ftrace_return_to_handler
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 169b3c44ee97..aaeca73218cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -242,13 +242,16 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
 	trace->calltime = current->ret_stack[index].calltime;
 	trace->overrun = atomic_read(&current->trace_overrun);
 	trace->depth = index;
+
+	trace_dump_stack(0);
 }
 
 /*
  * Send the trace to the ring-buffer.
  * @return the original return address.
  */
-unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer,
+				       unsigned long *ptr)
 {
 	struct ftrace_graph_ret trace;
 	unsigned long ret;
@@ -257,6 +260,8 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 	trace.rettime = trace_clock_local();
 	barrier();
 	current->curr_ret_stack--;
+	*ptr = ret;
+
 	/*
 	 * The curr_ret_stack can be less than -1 only if it was
 	 * filtered out and it's about to return from the function.
On 2018-11-06, Steven Rostedt rostedt@goodmis.org wrote:
On Sun, 4 Nov 2018 22:59:13 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
The same issue is present in __save_stack_trace (arch/x86/kernel/stacktrace.c). This is likely the only reason that -- as Steven said -- stacktraces wouldn't work with ftrace-graph (and thus with the refactor both of you are discussing).
By the way, I was playing with the the orc unwinder and stack traces from the function graph tracer return code, and got it working with the below patch. Caution, that patch also has a stack trace hardcoded in the return path of the function graph tracer, so you don't want to run function graph tracing without filtering.
Neat!
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 169b3c44ee97..aaeca73218cc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -242,13 +242,16 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index;
- trace_dump_stack(0);
Right, this works because save_stack is not being passed a pt_regs. But if you pass a pt_regs (as happens with bpf_get_stackid -- which is what spawned this discussion) then the top-most entry of the stack will still be a trampoline because there is no ftrace_graph_ret_addr call.
(I'm struggling with how to fix this -- I can't figure out what retp should be if you have a pt_regs. ->sp doesn't appear to work -- it's off by a few bytes.)
I will attach what I have at the moment to hopefully explain what the issue I've found is (re-using the kretprobe architecture but with the shadow-stack idea).
On 2018-11-08, Aleksa Sarai cyphar@cyphar.com wrote:
I will attach what I have at the moment to hopefully explain what the issue I've found is (re-using the kretprobe architecture but with the shadow-stack idea).
Here is the patch I have at the moment (it works, except for the question I have about how to handle the top-level pt_regs -- I've marked that code with XXX).
On Thu, Nov 08, 2018 at 07:04:48PM +1100, Aleksa Sarai wrote:
On 2018-11-08, Aleksa Sarai cyphar@cyphar.com wrote:
I will attach what I have at the moment to hopefully explain what the issue I've found is (re-using the kretprobe architecture but with the shadow-stack idea).
Here is the patch I have at the moment (it works, except for the question I have about how to handle the top-level pt_regs -- I've marked that code with XXX).
-- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH https://www.cyphar.com/
--8<---------------------------------------------------------------------
Since the return address is modified by kretprobe, the various unwinders can produce invalid and confusing stack traces. ftrace mostly solved this problem by teaching each unwinder how to find the original return address for stack trace purposes. This same technique can be applied to kretprobes by simply adding a pointer to where the return address was replaced in the stack, and then looking up the relevant kretprobe_instance when a stack trace is requested.
[WIP: This is currently broken because the *first entry* will not be overwritten since it looks like the stack pointer is different when we are provided pt_regs. All other addresses are correctly handled.]
When you see this problem, what does regs->ip point to? If it's pointing to generated code, then we don't _currently_ have a way of dealing with that. If it's pointing to a real function, we can fix that with unwind hints.
On Thu, 8 Nov 2018 08:44:37 -0600 Josh Poimboeuf jpoimboe@redhat.com wrote:
On Thu, Nov 08, 2018 at 07:04:48PM +1100, Aleksa Sarai wrote:
On 2018-11-08, Aleksa Sarai cyphar@cyphar.com wrote:
I will attach what I have at the moment to hopefully explain what the issue I've found is (re-using the kretprobe architecture but with the shadow-stack idea).
Here is the patch I have at the moment (it works, except for the question I have about how to handle the top-level pt_regs -- I've marked that code with XXX).
-- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH https://www.cyphar.com/
--8<---------------------------------------------------------------------
Since the return address is modified by kretprobe, the various unwinders can produce invalid and confusing stack traces. ftrace mostly solved this problem by teaching each unwinder how to find the original return address for stack trace purposes. This same technique can be applied to kretprobes by simply adding a pointer to where the return address was replaced in the stack, and then looking up the relevant kretprobe_instance when a stack trace is requested.
[WIP: This is currently broken because the *first entry* will not be overwritten since it looks like the stack pointer is different when we are provided pt_regs. All other addresses are correctly handled.]
When you see this problem, what does regs->ip point to? If it's pointing to generated code, then we don't _currently_ have a way of dealing with that. If it's pointing to a real function, we can fix that with unwind hints.
As I replied, if the stackdump is called from a kretprobe event, regs->ip always points to the trampoline function. Otherwise (maybe from a kprobe event, or panic, BUG, etc.) it will always be the address at which the event occurred.
So fixing regs->ip is correct.
Thank you,
On 2018-11-09, Masami Hiramatsu mhiramat@kernel.org wrote:
On Thu, 8 Nov 2018 08:44:37 -0600 Josh Poimboeuf jpoimboe@redhat.com wrote:
On Thu, Nov 08, 2018 at 07:04:48PM +1100, Aleksa Sarai wrote:
On 2018-11-08, Aleksa Sarai cyphar@cyphar.com wrote:
I will attach what I have at the moment to hopefully explain what the issue I've found is (re-using the kretprobe architecture but with the shadow-stack idea).
Here is the patch I have at the moment (it works, except for the question I have about how to handle the top-level pt_regs -- I've marked that code with XXX).
-- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH https://www.cyphar.com/
--8<---------------------------------------------------------------------
Since the return address is modified by kretprobe, the various unwinders can produce invalid and confusing stack traces. ftrace mostly solved this problem by teaching each unwinder how to find the original return address for stack trace purposes. This same technique can be applied to kretprobes by simply adding a pointer to where the return address was replaced in the stack, and then looking up the relevant kretprobe_instance when a stack trace is requested.
[WIP: This is currently broken because the *first entry* will not be overwritten since it looks like the stack pointer is different when we are provided pt_regs. All other addresses are correctly handled.]
When you see this problem, what does regs->ip point to? If it's pointing to generated code, then we don't _currently_ have a way of dealing with that. If it's pointing to a real function, we can fix that with unwind hints.
As I replied, if the stackdump is called from a kretprobe event, regs->ip always points to the trampoline function. Otherwise (maybe from a kprobe event, or panic, BUG, etc.) it will always be the address at which the event occurred.
So fixing regs->ip is correct.
The problem is that the pointer to the *return address* is wrong (kernel_stack_pointer() gives you a different result than at function entry); it's not that regs->ip is wrong. And I'm sure that it's "wrong" because it's not possible for "regs->ip == kretprobe_trampoline" unless you are in a stack frame that has been modified by the kretprobe core.
I will take a closer look at this over the weekend -- I posted the patch to try to help explain what the underlying issue I was trying to solve with this patch series is (and why I don't think the ftrace changes proposed in the thread will completely fix them).
On Thu, 8 Nov 2018 19:04:48 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
On 2018-11-08, Aleksa Sarai cyphar@cyphar.com wrote:
I will attach what I have at the moment to hopefully explain what the issue I've found is (re-using the kretprobe architecture but with the shadow-stack idea).
Here is the patch I have at the moment (it works, except for the question I have about how to handle the top-level pt_regs -- I've marked that code with XXX).
-- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH https://www.cyphar.com/
--8<---------------------------------------------------------------------
Since the return address is modified by kretprobe, the various unwinders can produce invalid and confusing stack traces. ftrace mostly solved this problem by teaching each unwinder how to find the original return address for stack trace purposes. This same technique can be applied to kretprobes by simply adding a pointer to where the return address was replaced in the stack, and then looking up the relevant kretprobe_instance when a stack trace is requested.
[WIP: This is currently broken because the *first entry* will not be overwritten since it looks like the stack pointer is different when we are provided pt_regs. All other addresses are correctly handled.]
Signed-off-by: Aleksa Sarai cyphar@cyphar.com
 arch/x86/events/core.c         |  6 +++-
 arch/x86/include/asm/ptrace.h  |  1 +
 arch/x86/kernel/kprobes/core.c |  5 ++--
 arch/x86/kernel/stacktrace.c   | 10 +++++--
 arch/x86/kernel/unwind_frame.c |  2 ++
 arch/x86/kernel/unwind_guess.c |  5 +++-
 arch/x86/kernel/unwind_orc.c   |  2 ++
 include/linux/kprobes.h        | 15 +++++++++-
 kernel/kprobes.c               | 55 ++++++++++++++++++++++++++++++++++
 9 files changed, 93 insertions(+), 8 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index de32741d041a..d71062702179 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2371,7 +2371,11 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re return; }
- if (perf_callchain_store(entry, regs->ip))
- /* XXX: Currently broken -- stack_addr(regs) doesn't match entry. */
- addr = regs->ip;
- //addr = ftrace_graph_ret_addr(current, &state.graph_idx, addr, stack_addr(regs));
- addr = kretprobe_ret_addr(current, addr, stack_addr(regs));
- if (perf_callchain_store(entry, addr)) return;
for (unwind_start(&state, current, regs, NULL); !unwind_done(&state); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index ee696efec99f..c4dfafd43e11 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -172,6 +172,7 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) return regs->sp; } #endif +#define stack_addr(regs) ((unsigned long *) kernel_stack_pointer(regs))
No, you should use kernel_stack_pointer(regs) itself instead of stack_addr().
#define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b0d1e81c96bb..eb4da885020c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -69,8 +69,6 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
I don't like keeping this meaningless macro... this should be replaced with generic kernel_stack_pointer() macro.
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ @@ -568,7 +566,8 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long *sara = stack_addr(regs);
- ri->ret_addr = (kprobe_opcode_t *) *sara;
- ri->ret_addrp = (kprobe_opcode_t **) sara;
- ri->ret_addr = *ri->ret_addrp;
/* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 7627455047c2..8a4fb3109d6b 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -8,6 +8,7 @@ #include <linux/sched/task_stack.h> #include <linux/stacktrace.h> #include <linux/export.h> +#include <linux/kprobes.h> #include <linux/uaccess.h> #include <asm/stacktrace.h> #include <asm/unwind.h> @@ -37,8 +38,13 @@ static void noinline __save_stack_trace(struct stack_trace *trace, struct unwind_state state; unsigned long addr;
- if (regs)
save_stack_address(trace, regs->ip, nosched);
- if (regs) {
/* XXX: Currently broken -- stack_addr(regs) doesn't match entry. */
addr = regs->ip;
Since this part is for storing regs->ip as a top of call-stack, this seems correct code. Stack unwind will be done next block.
//addr = ftrace_graph_ret_addr(current, &state.graph_idx, addr, stack_addr(regs));
so func graph return trampoline address will be shown only when unwinding stack entries. I mean func-graph tracer is not used as an event, so it never kicks stackdump.
addr = kretprobe_ret_addr(current, addr, stack_addr(regs));
But since kretprobe will be an event, which can kick the stackdump. BTW, from kretprobe, regs->ip should always be the trampoline handler, see arch/x86/kernel/kprobes/core.c:772 :-) So it must be fixed always.
save_stack_address(trace, addr, nosched);
- }
for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 3dc26f95d46e..47062427e9a3 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -1,4 +1,5 @@ #include <linux/sched.h> +#include <linux/kprobes.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/interrupt.h> @@ -270,6 +271,7 @@ static bool update_stack_state(struct unwind_state *state, addr = READ_ONCE_TASK_STACK(state->task, *addr_p); state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p);
}state->ip = kretprobe_ret_addr(state->task, state->ip, addr_p);
/* Save the original stack pointer for unwind_dump(): */ diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 4f0e17b90463..554fd7c5c331 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -1,5 +1,6 @@ #include <linux/sched.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include <asm/ptrace.h> #include <asm/bitops.h> #include <asm/stacktrace.h> @@ -14,8 +15,10 @@ unsigned long unwind_get_return_address(struct unwind_state *state) addr = READ_ONCE_NOCHECK(*state->sp);
- return ftrace_graph_ret_addr(state->task, &state->graph_idx,
- addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, state->sp);
- addr = kretprobe_ret_addr(state->task, addr, state->sp);
- return addr;
} EXPORT_SYMBOL_GPL(unwind_get_return_address); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 26038eacf74a..b6393500d505 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -1,5 +1,6 @@ #include <linux/module.h> #include <linux/sort.h> +#include <linux/kprobes.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #include <asm/unwind.h> @@ -462,6 +463,7 @@ bool unwind_next_frame(struct unwind_state *state) state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, state->ip, (void *)ip_p);
state->ip = kretprobe_ret_addr(state->task, state->ip, (void *)ip_p);
state->sp = sp; state->regs = NULL; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e909413e4e38..3a01f9998064 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -172,6 +172,7 @@ struct kretprobe_instance { struct hlist_node hlist; struct kretprobe *rp; kprobe_opcode_t *ret_addr;
- kprobe_opcode_t **ret_addrp; struct task_struct *task; char data[0];
}; @@ -203,6 +204,7 @@ static inline int kprobes_built_in(void) extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); extern int arch_trampoline_kprobe(struct kprobe *p); +extern void kretprobe_trampoline(void); #else /* CONFIG_KRETPROBES */ static inline void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) @@ -212,6 +214,9 @@ static inline int arch_trampoline_kprobe(struct kprobe *p) { return 0; } +static inline void kretprobe_trampoline(void) +{ +} #endif /* CONFIG_KRETPROBES */ extern struct kretprobe_blackpoint kretprobe_blacklist[]; @@ -341,7 +346,7 @@ struct kprobe *get_kprobe(void *addr); void kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags); void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags); -struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk); +struct hlist_head *kretprobe_inst_table_head(struct task_struct *tsk); /* kprobe_running() will just return the current_kprobe on this CPU */ static inline struct kprobe *kprobe_running(void) @@ -371,6 +376,9 @@ void unregister_kretprobe(struct kretprobe *rp); int register_kretprobes(struct kretprobe **rps, int num); void unregister_kretprobes(struct kretprobe **rps, int num); +unsigned long kretprobe_ret_addr(struct task_struct *tsk, unsigned long ret,
unsigned long *retp);
void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); @@ -425,6 +433,11 @@ static inline void unregister_kretprobe(struct kretprobe *rp) static inline void unregister_kretprobes(struct kretprobe **rps, int num) { } +unsigned long kretprobe_ret_addr(struct task_struct *task, unsigned long ret,
unsigned long *retp)
+{
- return ret;
+} static inline void kprobe_flush_task(struct task_struct *tk) { } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 90e98e233647..ed78141664ec 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -83,6 +83,11 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } +struct hlist_head *kretprobe_inst_table_head(struct task_struct *tsk) +{
- return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+}
/* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1206,6 +1211,15 @@ __releases(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_unlock); +static bool kretprobe_hash_is_locked(struct task_struct *tsk) +{
- unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
- raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
- return raw_spin_is_locked(hlist_lock);
+} +NOKPROBE_SYMBOL(kretprobe_hash_is_locked);
/*
- This function is called from finish_task_switch when task tk becomes dead,
- so that we can recycle any function-return probe instances associated
@@ -1856,6 +1870,41 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); +unsigned long kretprobe_ret_addr(struct task_struct *tsk, unsigned long ret,
unsigned long *retp)
+{
- struct kretprobe_instance *ri;
- unsigned long flags = 0;
- struct hlist_head *head;
- bool need_lock;
- if (likely(ret != (unsigned long) &kretprobe_trampoline))
return ret;
- need_lock = !kretprobe_hash_is_locked(tsk);
- if (WARN_ON(need_lock))
kretprobe_hash_lock(tsk, &head, &flags);
- else
head = kretprobe_inst_table_head(tsk);
This may not work unless this is called from the kretprobe handler context, since if we are outside of kretprobe handler context, another CPU can lock the hash table and that will be detected by kretprobe_hash_is_locked().
So, we should check whether we are in the kretprobe handler context if tsk == current; if not, we definitely can take the hash lock without any warning. This can be something like:
	if (is_kretprobe_handler_context()) {
		// kretprobe_hash_lock(current == tsk) has been locked by caller
		if (tsk != current &&
		    kretprobe_hash(tsk) != kretprobe_hash(current))
			// the hash of tsk and current can be same.
			need_lock = true;
	} else
		// we should take a lock for tsk.
		need_lock = true;
Thank you,
- hlist_for_each_entry(ri, head, hlist) {
if (ri->task != current)
continue;
if (ri->ret_addr == (kprobe_opcode_t *) &kretprobe_trampoline)
continue;
if (ri->ret_addrp == (kprobe_opcode_t **) retp) {
ret = (unsigned long) ri->ret_addr;
goto out;
}
- }
+out:
- if (need_lock)
kretprobe_hash_unlock(tsk, &flags);
- return ret;
+} +NOKPROBE_SYMBOL(kretprobe_ret_addr);
bool __weak arch_kprobe_on_func_entry(unsigned long offset) { return !offset; @@ -2005,6 +2054,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); +unsigned long kretprobe_ret_addr(struct task_struct *tsk, unsigned long ret,
unsigned long *retp)
+{
- return ret;
+} +NOKPROBE_SYMBOL(kretprobe_ret_addr); #endif /* CONFIG_KRETPROBES */ /* Set the kprobe gone and remove its instruction buffer. */ -- 2.19.1
On 2018-11-09, Masami Hiramatsu mhiramat@kernel.org wrote:
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index ee696efec99f..c4dfafd43e11 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -172,6 +172,7 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) return regs->sp; } #endif +#define stack_addr(regs) ((unsigned long *) kernel_stack_pointer(regs))
No, you should use kernel_stack_pointer(regs) itself instead of stack_addr().
#define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b0d1e81c96bb..eb4da885020c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -69,8 +69,6 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
I don't like keeping this meaningless macro... this should be replaced with generic kernel_stack_pointer() macro.
Sure. This patch was just an example -- I can remove stack_addr() all over.
- if (regs)
save_stack_address(trace, regs->ip, nosched);
- if (regs) {
/* XXX: Currently broken -- stack_addr(regs) doesn't match entry. */
addr = regs->ip;
Since this part is for storing regs->ip as a top of call-stack, this seems correct code. Stack unwind will be done next block.
This comment was referring to the usage of stack_addr(). stack_addr() doesn't give you the right result (it isn't the address of the return address -- it's slightly wrong). This is the main issue I was having -- am I doing something wrong here?
//addr = ftrace_graph_ret_addr(current, &state.graph_idx, addr, stack_addr(regs));
so func graph return trampoline address will be shown only when unwinding stack entries. I mean func-graph tracer is not used as an event, so it never kicks stackdump.
Just to make sure I understand what you're saying -- func-graph trace will never actually call __ftrace_stack_trace? Because if it does, then this code will be necessary (and then I'm a bit confused why the unwinder has func-graph trace code -- if stack traces are never taken under func-graph then the code in the unwinder is not necessary)
My reason for commenting this out is because at this point "state" isn't initialised and thus .graph_idx would not be correctly handled during unwind (and it's the same reason I commented it out later).
addr = kretprobe_ret_addr(current, addr, stack_addr(regs));
But since kretprobe will be an event, which can kick the stackdump. BTW, from kretprobe, regs->ip should always be the trampoline handler, see arch/x86/kernel/kprobes/core.c:772 :-) So it must be fixed always.
Right, but kretprobe_ret_addr() is returning the *original* return address (and we need to do an (addr == kretprobe_trampoline)). The real problem is that stack_addr(regs) isn't the same as it is during kretprobe setup (but kretprobe_ret_addr() works everywhere else).
@@ -1856,6 +1870,41 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); +unsigned long kretprobe_ret_addr(struct task_struct *tsk, unsigned long ret,
unsigned long *retp)
+{
- struct kretprobe_instance *ri;
- unsigned long flags = 0;
- struct hlist_head *head;
- bool need_lock;
- if (likely(ret != (unsigned long) &kretprobe_trampoline))
return ret;
- need_lock = !kretprobe_hash_is_locked(tsk);
- if (WARN_ON(need_lock))
kretprobe_hash_lock(tsk, &head, &flags);
- else
head = kretprobe_inst_table_head(tsk);
This may not work unless this is called from the kretprobe handler context, since if we are outside of kretprobe handler context, another CPU can lock the hash table and that will be detected by kretprobe_hash_is_locked().
Yeah, I noticed this as well when writing it (but needed a quick impl that I could test). I will fix this, thanks!
By is_kretprobe_handler_context() I imagine you are referring to checking is_kretprobe(current_kprobe())?
So, we should check whether we are in the kretprobe handler context if tsk == current; if not, we definitely can take the hash lock without any warning. This can be something like:
	if (is_kretprobe_handler_context()) {
		// kretprobe_hash_lock(current == tsk) has been locked by caller
		if (tsk != current &&
		    kretprobe_hash(tsk) != kretprobe_hash(current))
			// the hash of tsk and current can be same.
			need_lock = true;
	} else
		// we should take a lock for tsk.
		need_lock = true;
On Sat, 10 Nov 2018 02:06:29 +1100 Aleksa Sarai asarai@suse.de wrote:
On 2018-11-09, Masami Hiramatsu mhiramat@kernel.org wrote:
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index ee696efec99f..c4dfafd43e11 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -172,6 +172,7 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) return regs->sp; } #endif +#define stack_addr(regs) ((unsigned long *) kernel_stack_pointer(regs))
No, you should use kernel_stack_pointer(regs) itself instead of stack_addr().
#define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b0d1e81c96bb..eb4da885020c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -69,8 +69,6 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
I don't like keeping this meaningless macro... this should be replaced with generic kernel_stack_pointer() macro.
Sure. This patch was just an example -- I can remove stack_addr() all over.
- if (regs)
save_stack_address(trace, regs->ip, nosched);
- if (regs) {
/* XXX: Currently broken -- stack_addr(regs) doesn't match entry. */
addr = regs->ip;
Since this part is for storing regs->ip as a top of call-stack, this seems correct code. Stack unwind will be done next block.
This comment was referring to the usage of stack_addr(). stack_addr() doesn't give you the right result (it isn't the address of the return address -- it's slightly wrong). This is the main issue I was having -- am I doing something wrong here?
Of course stack_addr() just returns where the stack is. It is not itself a return address, though a return address from this event may happen to be stored there. Note that "regs != NULL" means you will be in an interrupt handler, and it will return to regs->ip.
//addr = ftrace_graph_ret_addr(current, &state.graph_idx, addr, stack_addr(regs));
so func graph return trampoline address will be shown only when unwinding stack entries. I mean func-graph tracer is not used as an event, so it never kicks stackdump.
Just to make sure I understand what you're saying -- func-graph trace will never actually call __ftrace_stack_trace? Because if it does, then this code will be necessary (and then I'm a bit confused why the unwinder has func-graph trace code -- if stack traces are never taken under func-graph then the code in the unwinder is not necessary)
You seem to be misunderstanding. Even if this is not called from the func-graph tracer, the stack entries have already been replaced with the func-graph trampoline. However, regs->ip (the IRQ return address) is never replaced by the func-graph trampoline.
My reason for commenting this out is because at this point "state" isn't initialised and thus .graph_idx would not be correctly handled during unwind (and it's the same reason I commented it out later).
OK, but anyway, I think we don't need it.
addr = kretprobe_ret_addr(current, addr, stack_addr(regs));
But since kretprobe will be an event, which can kick the stackdump. BTW, from kretprobe, regs->ip should always be the trampoline handler, see arch/x86/kernel/kprobes/core.c:772 :-) So it must be fixed always.
Right, but kretprobe_ret_addr() is returning the *original* return address (and we need to do an (addr == kretprobe_trampoline)). The real problem is that stack_addr(regs) isn't the same as it is during kretprobe setup (but kretprobe_ret_addr() works everywhere else).
I think stack_addr(regs) should be the same when this is called from kretprobe handler context. Otherwise, yes, it is not the same, but in that case regs->ip is not kretprobe_trampoline either.
If you find kretprobe_trampoline on the "stack", of course its address should be the same as it was during kretprobe setup; but if you find kretprobe_trampoline in regs->ip, that should only happen in kretprobe handler context. Otherwise, some critical violation has happened in kretprobe_trampoline. In that case, we should dump the kretprobe_trampoline address itself and not recover it.
@@ -1856,6 +1870,41 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); +unsigned long kretprobe_ret_addr(struct task_struct *tsk, unsigned long ret,
unsigned long *retp)
+{
- struct kretprobe_instance *ri;
- unsigned long flags = 0;
- struct hlist_head *head;
- bool need_lock;
- if (likely(ret != (unsigned long) &kretprobe_trampoline))
return ret;
- need_lock = !kretprobe_hash_is_locked(tsk);
- if (WARN_ON(need_lock))
kretprobe_hash_lock(tsk, &head, &flags);
- else
head = kretprobe_inst_table_head(tsk);
This may not work unless this is called from the kretprobe handler context, since if we are out of kretprobe handler context, another CPU can lock the hash table and it can be detected by kretprobe_hash_is_locked();.
Yeah, I noticed this as well when writing it (but needed a quick impl that I could test). I will fix this, thanks!
By is_kretprobe_handler_context() I imagine you are referring to checking is_kretprobe(current_kprobe())?
yes, that's correct :)
Thank you,
So, we should check we are in the kretprobe handler context if tsk == current, if not, we definately can lock the hash lock without any warning. This can be something like;
	if (is_kretprobe_handler_context()) {
		// kretprobe_hash_lock(current == tsk) has been locked by caller
		if (tsk != current &&
		    kretprobe_hash(tsk) != kretprobe_hash(current))
			// the hash of tsk and current can be same.
			need_lock = true;
	} else
		// we should take a lock for tsk.
		need_lock = true;
-- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH https://www.cyphar.com/
On 2018-11-11, Masami Hiramatsu mhiramat@kernel.org wrote:
addr = kretprobe_ret_addr(current, addr, stack_addr(regs));
But since kretprobe will be an event, which can kick the stackdump. BTW, from kretprobe, regs->ip should always be the trampoline handler, see arch/x86/kernel/kprobes/core.c:772 :-) So it must be fixed always.
Right, but kretprobe_ret_addr() is returning the *original* return address (and we need to do an (addr == kretprobe_trampoline)). The real problem is that stack_addr(regs) isn't the same as it is during kretprobe setup (but kretprobe_ret_addr() works everywhere else).
I think stack_addr(regs) should be the same when this is called from kretprobe handler context. Otherwise, yes, it is not the same, but in that case regs->ip is not kretprobe_trampoline either.
I figured it out.
It should be (regs->sp - 1) (just like it is inside the relevant unwinder function for ORC). I now have a prototype which works under the frame unwinder[*] -- however under ORC you can only see the top-most function (the unwinder doesn't see the other function calls). I'm playing with ORC hints with kretprobe_trampoline to try to improve things but it's still a bit screwy.
[*]: However, I've noticed that the stack traces between the two traces no longer match. On kprobe you get function_name+1, but on kretprobe you get function_caller+foo. Obviously it's working but the return address results in slightly different stack traces. This means that stack trace aggregation between kprobe and kretprobe won't work anymore -- at least not like it did in my original patch. So I'm really not sure where to go from here.
I can send around another patchset to illustrate the problem if you like (as well as show how the current unwinding code works).
On Fri, 2 Nov 2018 09:16:58 -0400 Steven Rostedt rostedt@goodmis.org wrote:
On Fri, 2 Nov 2018 17:59:32 +1100 Aleksa Sarai cyphar@cyphar.com wrote:
As an aside, I just tested with the frame unwinder and it isn't thrown off-course by kretprobe_trampoline (though obviously the stack is still wrong). So I think we just need to hook into the ORC unwinder to get it to continue skipping up the stack, as well as add the rewriting code for the stack traces (for all unwinders I guess -- though ideally we should
I agree that this is the right solution.
do this without having to add the same code to every architecture).
True, and there's an art to consolidating the code between architectures.
I'm currently looking at function graph and seeing if I can consolidate it too. And I'm also trying to get multiple users to hook into its infrastructure. I think I finally figured out a way to do so.
For supporting multiple users without any memory allocation, I think each user should consume the shadow stack and store its data on it. My old generic retstack implementation did that.
https://github.com/mhiramat/linux/commit/8804f76580cd863d555854b41b9c6df719f...
I hope this may give you some insights. My idea is to generalize the shadow stack, not the func graph tracer, since I don't want to make kretprobe depend on the func graph tracer, but only on the shadow stack.
The reason it is difficult is that you need to maintain state between the entry of a function and the exit for each task and callback that is registered. Hence, it's a 3-tuple (function stack, task, callbacks). And this must be maintained across preemption. A task may sleep for minutes, and the state needs to be retained.
Do you mean preempt_disable()? Anyway, we just need to increment the index atomically, don't we?
The only state that must be retained is the function stack with the task, because if that gets out of sync, the system crashes. But the callback state can be removed.
Here's what is there now:
When something is registered with the function graph tracer, every task gets a shadowed stack. A hook is added to fork to add shadow stacks to new tasks. Once a shadow stack is added to a task, that shadow stack is never removed until the task exits.
When the function is entered, the real return code is stored in the shadow stack and the trampoline address is put in its place.
On return, the trampoline is called, and it will pop off the return code from the shadow stack and return to that.
The issue with multiple users is that different users may want to trace different functions. On entry, a user could say it doesn't want to trace the current function, and then its return callback must not be called on exit. Keeping track of which user needs the return called is the tricky part.
So I think only the "shadow stack" part should be generalized.
Here's what I plan on implementing:
Along with a shadow stack, I was going to add a 4096 byte (one page) array that holds 64 8 byte masks to every task as well. This will allow 64 simultaneous users (which is rather extreme). If we need to support more, we could allocate another page for all tasks. The 8 byte mask will represent each depth (allowing to do this for 64 function call stack depth, which should also be enough).
Each user will be assigned one of the masks. Each bit in the mask represents the depth of the shadow stack. When a function is called, each user registered with the function graph tracer will get called (if they asked to be called for this function, via the ftrace_ops hashes) and if they want to trace the function, then the bit is set in the mask for that stack depth.
When the function exits the function and we pop off the return code from the shadow stack, we then look at all the bits set for the corresponding users, and call their return callbacks, and ignore anything that is not set.
It sounds too complicated... why don't we just open the shadow stack to each user? Of course it may require a bit of repeated unwinding on the shadow stack, but it is simple.
Thank you,
When a user is unregistered, the corresponding bits that represent it are cleared, and its return callback will not be called. But the tasks being traced will still have their shadow stack, to allow them to get back to normal.
I'll hopefully have a prototype ready by plumbers.
And this too will require each architecture to probably change. As a side project to this, I'm going to try to consolidate the function graph code among all the architectures as well. Not an easy task.
-- Steve
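A minimal userspace model of the per-depth bitmask plan Steve describes above (64 possible users, one 64-bit mask per call-stack depth). All names, sizes and types here are illustrative, not the real function graph code.

/* Userspace model of a per-task shadow stack plus a per-depth user mask. */
#include <stdio.h>
#include <stdint.h>

#define MAX_DEPTH 64
#define MAX_USERS 64

typedef void (*ret_cb_t)(int depth);

struct task_graph_state {
	unsigned long ret_stack[MAX_DEPTH];	/* real return addresses */
	uint64_t user_mask[MAX_DEPTH];		/* which users traced this depth */
	int depth;
};

static ret_cb_t return_cbs[MAX_USERS];		/* one slot per registered user */

/* Function entry: push the real return address and record which users
 * asked to trace this call. */
static void graph_entry(struct task_graph_state *t, unsigned long ret,
			uint64_t interested_users)
{
	t->ret_stack[t->depth] = ret;
	t->user_mask[t->depth] = interested_users;
	t->depth++;
}

/* Function return: pop, and call only the users whose bit was set at
 * this depth. */
static unsigned long graph_return(struct task_graph_state *t)
{
	uint64_t mask;
	int u;

	t->depth--;
	mask = t->user_mask[t->depth];
	for (u = 0; u < MAX_USERS; u++)
		if ((mask & (1ULL << u)) && return_cbs[u])
			return_cbs[u](t->depth);
	return t->ret_stack[t->depth];
}

static void user0_cb(int depth) { printf("user0 return cb, depth %d\n", depth); }
static void user1_cb(int depth) { printf("user1 return cb, depth %d\n", depth); }

int main(void)
{
	struct task_graph_state t = { .depth = 0 };

	return_cbs[0] = user0_cb;
	return_cbs[1] = user1_cb;

	graph_entry(&t, 0x1000, 1ULL << 0);			/* only user 0 */
	graph_entry(&t, 0x2000, (1ULL << 0) | (1ULL << 1));	/* both users  */

	printf("return to %#lx\n", graph_return(&t));
	printf("return to %#lx\n", graph_return(&t));
	return 0;
}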
On 2018-11-02, Aleksa Sarai cyphar@cyphar.com wrote:
Unfortunately, I'm having a lot of trouble understanding how the current ftrace hooking works -- ORC has a couple of ftrace hooks that seem reasonable on the surface but I don't understand (for instance) how HAVE_FUNCTION_GRAPH_RET_ADDR_PTR *actually* works. Though your comment appears to indicate that it doesn't work for stack traces?
Sorry, I just figured out how it works! (To make sure I actually understand -- retp is a pointer to the place in the stack where the return address is stored, so it uniquely specifies what each trampoline actually points to -- this trick could also be used for kretprobes).
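A small userspace model of why that retp pointer helps: every hijacked frame holds the same trampoline *value*, but the *location* of the return-address slot is unique per frame, so it picks out the right shadow-stack entry. Names are illustrative; graph_ret_addr() only mimics what ftrace_graph_ret_addr() does conceptually.

/* Userspace model of retp-based disambiguation of trampoline entries. */
#include <stdio.h>

#define TRAMPOLINE 0xf00UL

struct ret_stack_entry {
	unsigned long orig_ret;
	unsigned long *retp;	/* where on the real stack we wrote TRAMPOLINE */
};

static struct ret_stack_entry ret_stack[64];
static int depth;

static void hijack(unsigned long *slot)
{
	ret_stack[depth].orig_ret = *slot;
	ret_stack[depth].retp = slot;
	depth++;
	*slot = TRAMPOLINE;
}

/* Only translate the trampoline value, and use retp to find the
 * matching entry. */
static unsigned long graph_ret_addr(unsigned long ret, unsigned long *retp)
{
	int i;

	if (ret != TRAMPOLINE)
		return ret;
	for (i = depth - 1; i >= 0; i--)
		if (ret_stack[i].retp == retp)
			return ret_stack[i].orig_ret;
	return ret;
}

int main(void)
{
	unsigned long frame_outer = 0x1111, frame_inner = 0x2222;

	hijack(&frame_outer);
	hijack(&frame_inner);

	/* Both slots now hold TRAMPOLINE; retp tells them apart. */
	printf("%#lx %#lx\n",
	       graph_ret_addr(frame_inner, &frame_inner),
	       graph_ret_addr(frame_outer, &frame_outer));
	return 0;
}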
Hi Aleksa,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on tip/perf/core]
[also build test ERROR on v4.19 next-20181101]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url:      https://github.com/0day-ci/linux/commits/Aleksa-Sarai/kretprobe-produce-sane...
config:   i386-randconfig-h1-11021006 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386
All errors (new ones prefixed by >>):
   kernel/kprobes.c: In function 'pre_handler_kretprobe':
   kernel/kprobes.c:1846:10: error: variable 'trace' has initializer but incomplete type
      struct stack_trace trace = {};
             ^
   kernel/kprobes.c:1846:22: error: storage size of 'trace' isn't known
      struct stack_trace trace = {};
                         ^
>> kernel/kprobes.c:1858:3: error: implicit declaration of function 'save_stack_trace_regs' [-Werror=implicit-function-declaration]
      save_stack_trace_regs(regs, &trace);
      ^
   kernel/kprobes.c:1846:22: warning: unused variable 'trace' [-Wunused-variable]
      struct stack_trace trace = {};
                         ^
   kernel/kprobes.c: In function 'kretprobe_save_stack_trace':
>> kernel/kprobes.c:1922:16: error: dereferencing pointer to incomplete type
      for (i = trace->skip; i < krt->nr_entries; i++) {
                    ^
   kernel/kprobes.c:1923:12: error: dereferencing pointer to incomplete type
      if (trace->nr_entries >= trace->max_entries)
               ^
   kernel/kprobes.c:1923:33: error: dereferencing pointer to incomplete type
      if (trace->nr_entries >= trace->max_entries)
                                    ^
   kernel/kprobes.c:1925:8: error: dereferencing pointer to incomplete type
      trace->entries[trace->nr_entries++] = krt->entries[i];
           ^
   kernel/kprobes.c:1925:23: error: dereferencing pointer to incomplete type
      trace->entries[trace->nr_entries++] = krt->entries[i];
                          ^
   cc1: some warnings being treated as errors
vim +/save_stack_trace_regs +1858 kernel/kprobes.c
  1819	
  1820	#ifdef CONFIG_KRETPROBES
  1821	/*
  1822	 * This kprobe pre_handler is registered with every kretprobe. When probe
  1823	 * hits it will set up the return probe.
  1824	 */
  1825	static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
  1826	{
  1827		struct kretprobe *rp = container_of(p, struct kretprobe, kp);
  1828		unsigned long hash, flags = 0;
  1829		struct kretprobe_instance *ri;
  1830	
  1831		/*
  1832		 * To avoid deadlocks, prohibit return probing in NMI contexts,
  1833		 * just skip the probe and increase the (inexact) 'nmissed'
  1834		 * statistical counter, so that the user is informed that
  1835		 * something happened:
  1836		 */
  1837		if (unlikely(in_nmi())) {
  1838			rp->nmissed++;
  1839			return 0;
  1840		}
  1841	
  1842		/* TODO: consider to only swap the RA after the last pre_handler fired */
  1843		hash = hash_ptr(current, KPROBE_HASH_BITS);
  1844		raw_spin_lock_irqsave(&rp->lock, flags);
  1845		if (!hlist_empty(&rp->free_instances)) {
> 1846			struct stack_trace trace = {};
  1847	
  1848			ri = hlist_entry(rp->free_instances.first,
  1849					struct kretprobe_instance, hlist);
  1850			hlist_del(&ri->hlist);
  1851			raw_spin_unlock_irqrestore(&rp->lock, flags);
  1852	
  1853			ri->rp = rp;
  1854			ri->task = current;
  1855	
  1856			trace.entries = &ri->entry.entries[0];
  1857			trace.max_entries = KRETPROBE_TRACE_SIZE;
> 1858			save_stack_trace_regs(regs, &trace);
  1859			ri->entry.nr_entries = trace.nr_entries;
  1860	
  1861			if (rp->entry_handler && rp->entry_handler(ri, regs)) {
  1862				raw_spin_lock_irqsave(&rp->lock, flags);
  1863				hlist_add_head(&ri->hlist, &rp->free_instances);
  1864				raw_spin_unlock_irqrestore(&rp->lock, flags);
  1865				return 0;
  1866			}
  1867	
  1868			arch_prepare_kretprobe(ri, regs);
  1869	
  1870			/* XXX(hch): why is there no hlist_move_head? */
  1871			INIT_HLIST_NODE(&ri->hlist);
  1872			kretprobe_table_lock(hash, &flags);
  1873			hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
  1874			kretprobe_table_unlock(hash, &flags);
  1875		} else {
  1876			rp->nmissed++;
  1877			raw_spin_unlock_irqrestore(&rp->lock, flags);
  1878		}
  1879		return 0;
  1880	}
  1881	NOKPROBE_SYMBOL(pre_handler_kretprobe);
  1882	
  1883	/*
  1884	 * Return the kretprobe_instance associated with the current_kprobe. Calling
  1885	 * this is only reasonable from within a kretprobe handler context (otherwise
  1886	 * return NULL).
  1887	 *
  1888	 * Must be called within a kretprobe_hash_lock(current, ...) context.
  1889	 */
  1890	struct kretprobe_instance *current_kretprobe_instance(void)
  1891	{
  1892		struct kprobe *kp;
  1893		struct kretprobe *rp;
  1894		struct kretprobe_instance *ri;
  1895		struct hlist_head *head;
  1896		unsigned long hash = hash_ptr(current, KPROBE_HASH_BITS);
  1897	
  1898		kp = kprobe_running();
  1899		if (!kp || !kprobe_is_retprobe(kp))
  1900			return NULL;
  1901		if (WARN_ON(!kretprobe_hash_is_locked(current)))
  1902			return NULL;
  1903	
  1904		rp = container_of(kp, struct kretprobe, kp);
  1905		head = &kretprobe_inst_table[hash];
  1906	
  1907		hlist_for_each_entry(ri, head, hlist) {
  1908			if (ri->task == current && ri->rp == rp)
  1909				return ri;
  1910		}
  1911		return NULL;
  1912	}
  1913	EXPORT_SYMBOL_GPL(current_kretprobe_instance);
  1914	NOKPROBE_SYMBOL(current_kretprobe_instance);
  1915	
  1916	void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
  1917					struct stack_trace *trace)
  1918	{
  1919		int i;
  1920		struct kretprobe_trace *krt = &ri->entry;
  1921	
> 1922		for (i = trace->skip; i < krt->nr_entries; i++) {
  1923			if (trace->nr_entries >= trace->max_entries)
  1924				break;
  1925			trace->entries[trace->nr_entries++] = krt->entries[i];
  1926		}
  1927	}
  1928	
--- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
This is effectively a reversion of commit 76094a2cf46e ("ftrace: distinguish kretprobe'd functions in trace logs"), as the checking of kretprobe_trampoline *for tracing* is no longer necessary with the new kretprobe stack trace changes.
Signed-off-by: Aleksa Sarai cyphar@cyphar.com
---
 kernel/trace/trace_output.c | 34 ++++------------------------------
 1 file changed, 4 insertions(+), 30 deletions(-)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 6e6cc64faa38..951de16bd4fd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -321,36 +321,14 @@ int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
 }
 EXPORT_SYMBOL_GPL(trace_output_call);
 
-#ifdef CONFIG_KRETPROBES
-static inline const char *kretprobed(const char *name)
-{
-	static const char tramp_name[] = "kretprobe_trampoline";
-	int size = sizeof(tramp_name);
-
-	if (strncmp(tramp_name, name, size) == 0)
-		return "[unknown/kretprobe'd]";
-	return name;
-}
-#else
-static inline const char *kretprobed(const char *name)
-{
-	return name;
-}
-#endif /* CONFIG_KRETPROBES */
-
 static void
 seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
 {
 	char str[KSYM_SYMBOL_LEN];
 #ifdef CONFIG_KALLSYMS
-	const char *name;
-
 	kallsyms_lookup(address, NULL, NULL, NULL, str);
-
-	name = kretprobed(str);
-
-	if (name && strlen(name)) {
-		trace_seq_printf(s, fmt, name);
+	if (strlen(str)) {
+		trace_seq_printf(s, fmt, str);
 		return;
 	}
 #endif
@@ -364,13 +342,9 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 {
 	char str[KSYM_SYMBOL_LEN];
 #ifdef CONFIG_KALLSYMS
-	const char *name;
-
 	sprint_symbol(str, address);
-	name = kretprobed(str);
-
-	if (name && strlen(name)) {
-		trace_seq_printf(s, fmt, name);
+	if (strlen(str)) {
+		trace_seq_printf(s, fmt, str);
 		return;
 	}
 #endif
linux-kselftest-mirror@lists.linaro.org