Hi,
Thank you Steven and Daniel for reviewing v1 and providing suggestions. This set of patches [v3] captures latency events caused by interrupts and preemption disabled in the kernel. The patches are based on the hist trigger feature developed by Tom Zanussi.
v2: https://lkml.org/lkml/2016/8/24/296
Mount Debugfs
mount -t debugfs nodev /sys/kernel/debug
Examples of triggers: echo 'hist:key=latency.log2:val=hitcount:sort=latency' > /sys/kernel/debug/tracing/events/latency/latency_preempt/trigger echo 'hist:key=cpu,latency:val=hitcount:sort=latency if cpu==1' > /sys/kernel/debug/tracing/events/latency/latency_preempt/trigger echo 'hist:key=ltype,cpu:val=latency:sort=ltype,cpu if ltype==0' > /sys/kernel/debug/tracing/events/latency/latency_preempt/trigger
Histogram output: cat /sys/kernel/debug/tracing/events/latency/latency_preempt/hist
Changes from v2 as per comments from Daniel - Use single tracepoint for irq/preempt/critical timings by introducing a trace type field to differentiate trace type in the same tracepoint. The suspicious RCU usage error was fixed and made part of Daniel's bugfix.
TODO: 1. perf interface. Not sure if this is needed 2. Latency histograms - process wakeup latency
Binoy
Binoy Jayan (2): tracing: Add trace_irqsoff tracepoints tracing: Histogram for missed timer offsets
Daniel Wagner (1): tracing: Deference pointers without RCU checks
include/linux/hrtimer.h | 3 ++ include/linux/rculist.h | 36 ++++++++++++++++++++ include/linux/tracepoint.h | 4 +-- include/trace/events/latency.h | 68 +++++++++++++++++++++++++++++++++++++ kernel/time/hrtimer.c | 39 +++++++++++++++++++++ kernel/trace/trace_events_filter.c | 4 +-- kernel/trace/trace_events_trigger.c | 6 ++-- kernel/trace/trace_irqsoff.c | 45 +++++++++++++++++++++++- 8 files changed, 197 insertions(+), 8 deletions(-) create mode 100644 include/trace/events/latency.h
From: Daniel Wagner daniel.wagner@bmw-carit.de
The tracepoint can't be used in code section where we are in the middle of a state transition.
For example if we place a tracepoint inside start/stop_critical_section(), lockdep complains with
[ 0.035589] WARNING: CPU: 0 PID: 3 at kernel/locking/lockdep.c:3560 \ check_flags.part.36+0x1bc/0x210() [ 0.036000] \ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled) [ 0.036000] Kernel panic - not \ syncing: panic_on_warn set ... [ 0.036000] [ 0.036000] CPU: 0 PID: 3 Comm: ksoftirqd/0 Not tainted 4.0.0-rc7+ #460 [ 0.036000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS \ 1.7.5-20140709_153950- 04/01/2014 [ 0.036000] ffffffff81f2463a ffff88007c93bb98 \ ffffffff81afb317 0000000000000001 [ 0.036000] ffffffff81f212b3 ffff88007c93bc18 \ ffffffff81af7bc2 ffff88007c93bbb8 [ 0.036000] ffffffff00000008 ffff88007c93bc28 \ ffff88007c93bbc8 000000000093bbd8 [ 0.036000] Call Trace: [ 0.036000] [<ffffffff81afb317>] dump_stack+0x4f/0x7b [ 0.036000] [<ffffffff81af7bc2>] panic+0xc0/0x1e9 [ 0.036000] [<ffffffff81b07188>] ? _raw_spin_unlock_irqrestore+0x38/0x80 [ 0.036000] [<ffffffff81052a60>] warn_slowpath_common+0xc0/0xc0 [ 0.036000] [<ffffffff81057116>] ? __local_bh_enable+0x36/0x70 [ 0.036000] [<ffffffff81052aa6>] warn_slowpath_fmt+0x46/0x50 [ 0.036000] [<ffffffff810a468c>] check_flags.part.36+0x1bc/0x210 [ 0.036000] [<ffffffff810a5168>] lock_is_held+0x78/0x90 [ 0.036000] [<ffffffff81057116>] ? __local_bh_enable+0x36/0x70 [ 0.036000] [<ffffffff810588ab>] ? __do_softirq+0x3db/0x500 [ 0.036000] [<ffffffff81134495>] trace_preempt_on+0x255/0x260 [ 0.036000] [<ffffffff81081c0b>] preempt_count_sub+0xab/0xf0 [ 0.036000] [<ffffffff81057116>] __local_bh_enable+0x36/0x70 [ 0.036000] [<ffffffff810588ab>] __do_softirq+0x3db/0x500 [ 0.036000] [<ffffffff810589ef>] run_ksoftirqd+0x1f/0x60 [ 0.036000] [<ffffffff81079653>] smpboot_thread_fn+0x193/0x2a0 [ 0.036000] [<ffffffff810794c0>] ? SyS_setgroups+0x150/0x150 [ 0.036000] [<ffffffff81075312>] kthread+0xf2/0x110 [ 0.036000] [<ffffffff81b026b3>] ? wait_for_completion+0xc3/0x120 [ 0.036000] [<ffffffff81081c0b>] ? preempt_count_sub+0xab/0xf0 [ 0.036000] [<ffffffff81075220>] ? 
kthread_create_on_node+0x240/0x240 [ 0.036000] [<ffffffff81b07cc8>] ret_from_fork+0x58/0x90 [ 0.036000] [<ffffffff81075220>] ? kthread_create_on_node+0x240/0x240 [ 0.036000] ---[ end Kernel panic - not syncing: panic_on_warn set ...
PeterZ was so kind to explain it to me what is happening:
"__local_bh_enable() tests if this is the last SOFTIRQ_OFFSET, if so it tells lockdep softirqs are enabled with trace_softirqs_on() after that we go an actually modify the preempt_count with preempt_count_sub(). Then in preempt_count_sub() you call into trace_preempt_on() if this was the last preempt_count increment but you do that _before_ you actually change the preempt_count with __preempt_count_sub() at this point lockdep and preempt_count think the world differs and *boom*"
So the simplest way to avoid this is by disabling the consistency checks.
We also need to take care of the iteration in trace_events_trigger.c to avoid a splat in conjunction with the hist trigger.
Signed-off-by: Daniel Wagner daniel.wagner@bmw-carit.de Signed-off-by: Binoy Jayan binoy.jayan@linaro.org --- include/linux/rculist.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/tracepoint.h | 4 ++-- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_events_trigger.c | 6 +++--- 4 files changed, 43 insertions(+), 7 deletions(-)
diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8beb98d..bee836b 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -279,6 +279,24 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, container_of(lockless_dereference(ptr), type, member)
/** + * list_entry_rcu_notrace - get the struct for this entry (for tracing) + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + * + * This is the same as list_entry_rcu() except that it does + * not do any RCU debugging or tracing. + */ +#define list_entry_rcu_notrace(ptr, type, member) \ +({ \ + typeof(*ptr) __rcu *__ptr = (typeof(*ptr) __rcu __force *)ptr; \ + container_of((typeof(ptr))rcu_dereference_raw_notrace(__ptr), type, member); \ +}) + +/** * Where are list_empty_rcu() and list_first_entry_rcu()? * * Implementing those functions following their counterparts list_empty() and @@ -391,6 +409,24 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, pos = list_entry_lockless(pos->member.next, typeof(*pos), member))
/** + * list_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + * + * This is the same as list_for_each_entry_rcu() except that it does + * not do any RCU debugging or tracing. + */ +#define list_for_each_entry_rcu_notrace(pos, head, member) \ + for (pos = list_entry_rcu_notrace((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry_rcu_notrace(pos->member.next, typeof(*pos), member)) + +/** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index be586c6..d0e2a82 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -138,7 +138,7 @@ extern void syscall_unregfunc(void); return; \ prercu; \ rcu_read_lock_sched_notrace(); \ - it_func_ptr = rcu_dereference_sched((tp)->funcs); \ + it_func_ptr = rcu_dereference_raw_notrace((tp)->funcs); \ if (it_func_ptr) { \ do { \ it_func = (it_func_ptr)->func; \ @@ -189,7 +189,7 @@ extern void syscall_unregfunc(void); TP_CONDITION(cond),,); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ rcu_read_lock_sched_notrace(); \ - rcu_dereference_sched(__tracepoint_##name.funcs);\ + rcu_dereference_raw_notrace(__tracepoint_##name.funcs);\ rcu_read_unlock_sched_notrace(); \ } \ } \ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9daa9b3..a00a8d5 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -608,11 +608,11 @@ int filter_match_preds(struct event_filter *filter, void *rec) /* * 
n_preds, root and filter->preds are protect with preemption disabled. */ - root = rcu_dereference_sched(filter->root); + root = rcu_dereference_raw_notrace(filter->root); if (!root) return 1;
- data.preds = preds = rcu_dereference_sched(filter->preds); + data.preds = preds = rcu_dereference_raw_notrace(filter->preds); ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); WARN_ON(ret); return data.match; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a975571..4489fb4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -71,14 +71,14 @@ event_triggers_call(struct trace_event_file *file, void *rec) if (list_empty(&file->triggers)) return tt;
- list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu_notrace(data, &file->triggers, list) { if (data->paused) continue; if (!rec) { data->ops->func(data, rec); continue; } - filter = rcu_dereference_sched(data->filter); + filter = rcu_dereference_raw_notrace(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; if (event_command_post_trigger(data->cmd_ops)) { @@ -111,7 +111,7 @@ event_triggers_post_call(struct trace_event_file *file, { struct event_trigger_data *data;
- list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu_notrace(data, &file->triggers, list) { if (data->paused) continue; if (data->cmd_ops->trigger_type & tt)
This work is based on work by Daniel Wagner. A few tracepoints are added at the end of the critical section. With the hist trigger in place, the hist trigger plots may be generated, with per-cpu breakdown of events captured. It is based on linux kernel's event infrastructure.
The following filter(s) may be used
'hist:key=latency.log2:val=hitcount:sort=latency' 'hist:key=cpu,latency:val=hitcount:sort=latency if cpu==1' 'hist:key=ltype,cpu:val=latency:sort=ltype,cpu if ltype==0'
Where ltype is 0: IRQSOFF latency 1: PREEMPTOFF Latency 2: Critical Timings
This captures only the latencies introduced by disabled irqs and preemption. Additional per process data has to be captured to calculate the effective latencies introduced for individual processes.
Initial work - latency.patch
[1] https://git.kernel.org/cgit/linux/kernel/git/rt/linux-stable-rt.git/commit/?...
Signed-off-by: Binoy Jayan binoy.jayan@linaro.org --- include/trace/events/latency.h | 37 ++++++++++++++++++++++++++++++++++ kernel/trace/trace_irqsoff.c | 45 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/latency.h
diff --git a/include/trace/events/latency.h b/include/trace/events/latency.h new file mode 100644 index 0000000..1c34536 --- /dev/null +++ b/include/trace/events/latency.h @@ -0,0 +1,37 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM latency + +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HIST_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(latency_template, + TP_PROTO(int ltype, int cpu, cycles_t latency), + + TP_ARGS(ltype, cpu, latency), + + TP_STRUCT__entry( + __field(int, ltype) + __field(int, cpu) + __field(cycles_t, latency) + ), + + TP_fast_assign( + __entry->ltype = ltype; + __entry->cpu = cpu; + __entry->latency = latency; + ), + + TP_printk("ltype=%d, cpu=%d, latency=%lu", + __entry->ltype, __entry->cpu, (unsigned long) __entry->latency) +); + +DEFINE_EVENT(latency_template, latency_preempt, + TP_PROTO(int ltype, int cpu, cycles_t latency), + TP_ARGS(ltype, cpu, latency)); + +#endif /* _TRACE_HIST_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03cdff8..d4f2b25 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -13,14 +13,27 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/percpu-defs.h> + +#include <trace/events/sched.h>
#include "trace.h"
+#define CREATE_TRACE_POINTS +#include <trace/events/latency.h> + static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly;
-static DEFINE_PER_CPU(int, tracing_cpu); +enum latency_type { + TS_IRQ, + TS_PREEMPT, + TS_CRITTIME, + TS_MAX +};
+static DEFINE_PER_CPU(int, tracing_cpu); +static DEFINE_PER_CPU(cycle_t, lat_ts[TS_MAX]); static DEFINE_RAW_SPINLOCK(max_trace_lock);
enum { @@ -419,9 +432,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_dec(&data->disabled); }
+static inline void latency_trace(enum latency_type type) +{ + trace_latency_preempt(type, raw_smp_processor_id(), + (cycle_t) trace_clock_local() - this_cpu_read(lat_ts[type])); + +} /* start and stop critical timings used to for stoppage (in idle) */ void start_critical_timings(void) { + if (trace_latency_preempt_enabled()) + this_cpu_write(lat_ts[TS_CRITTIME], + (cycle_t) trace_clock_local()); + if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -431,6 +454,10 @@ void stop_critical_timings(void) { if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + + if (trace_latency_preempt_enabled()) + latency_trace(TS_CRITTIME); + } EXPORT_SYMBOL_GPL(stop_critical_timings);
@@ -438,6 +465,9 @@ EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) { + if (trace_latency_preempt_enabled()) { + latency_trace(TS_IRQ); + } if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -446,6 +476,11 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); + + if (trace_latency_preempt_enabled()) { + this_cpu_write(lat_ts[TS_IRQ], + (cycle_t) trace_clock_local()); + } }
#else /* !CONFIG_PROVE_LOCKING */ @@ -503,6 +538,9 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { + if (trace_latency_preempt_enabled()) + latency_trace(TS_PREEMPT); + if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } @@ -511,6 +549,11 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); + + if (trace_latency_preempt_enabled()) { + this_cpu_write(lat_ts[TS_PREEMPT], + (cycle_t) trace_clock_local()); + } } #endif /* CONFIG_PREEMPT_TRACER */
Hi Binoy,
Some minor nitpicking.
On 08/29/2016 08:55 AM, Binoy Jayan wrote:
+DECLARE_EVENT_CLASS(latency_template,
- TP_PROTO(int ltype, int cpu, cycles_t latency),
- TP_ARGS(ltype, cpu, latency),
- TP_STRUCT__entry(
__field(int, ltype)
__field(int, cpu)
__field(cycles_t, latency)
- ),
- TP_fast_assign(
__entry->ltype = ltype;
__entry->cpu = cpu;
__entry->latency = latency;
- ),
- TP_printk("ltype=%d, cpu=%d, latency=%lu",
__entry->ltype, __entry->cpu, (unsigned long) __entry->latency)
+);
As Steven already pointed out, the cpu field is available in all traces. It's one of the predefined fields (see kernel/trace/trace_events.c/trace_define_generic_fields()). Just drop it.
+DEFINE_EVENT(latency_template, latency_preempt,
TP_PROTO(int ltype, int cpu, cycles_t latency),
TP_ARGS(ltype, cpu, latency));
+#endif /* _TRACE_HIST_H */
+/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03cdff8..d4f2b25 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -13,14 +13,27 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/percpu-defs.h>
+#include <trace/events/sched.h>
#include "trace.h"
+#define CREATE_TRACE_POINTS +#include <trace/events/latency.h>
static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly;
-static DEFINE_PER_CPU(int, tracing_cpu); +enum latency_type {
TS_IRQ,
TS_PREEMPT,
TS_CRITTIME,
TS_MAX
+};
Come to think of it, the TS_ prefix doesn't really make sense, maybe LT_ would be more consistent (short hand for latency type).
+static DEFINE_PER_CPU(int, tracing_cpu); +static DEFINE_PER_CPU(cycle_t, lat_ts[TS_MAX]); static DEFINE_RAW_SPINLOCK(max_trace_lock);
enum { @@ -419,9 +432,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_dec(&data->disabled); }
+static inline void latency_trace(enum latency_type type) +{
- trace_latency_preempt(type, raw_smp_processor_id(),
(cycle_t) trace_clock_local() - this_cpu_read(lat_ts[type]));
+}
empty line here
/* start and stop critical timings used to for stoppage (in idle) */ void start_critical_timings(void) {
- if (trace_latency_preempt_enabled())
this_cpu_write(lat_ts[TS_CRITTIME],
(cycle_t) trace_clock_local());
Introduce an inline function like the latency_trace() function. No need to open code this logic 3 times.
if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } @@ -431,6 +454,10 @@ void stop_critical_timings(void) { if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
- if (trace_latency_preempt_enabled())
latency_trace(TS_CRITTIME);
} EXPORT_SYMBOL_GPL(stop_critical_timings);
@@ -438,6 +465,9 @@ EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_PROVE_LOCKING void time_hardirqs_on(unsigned long a0, unsigned long a1) {
- if (trace_latency_preempt_enabled()) {
latency_trace(TS_IRQ);
- }
Please follow the style of the rest of the file. Just drop the brackets.
if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } @@ -446,6 +476,11 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1);
- if (trace_latency_preempt_enabled()) {
this_cpu_write(lat_ts[TS_IRQ],
(cycle_t) trace_clock_local());
- }
Same thing.
}
#else /* !CONFIG_PROVE_LOCKING */ @@ -503,6 +538,9 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) {
- if (trace_latency_preempt_enabled())
latency_trace(TS_PREEMPT);
- if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1);
} @@ -511,6 +549,11 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1);
- if (trace_latency_preempt_enabled()) {
this_cpu_write(lat_ts[TS_PREEMPT],
(cycle_t) trace_clock_local());
- }
And here too.
} #endif /* CONFIG_PREEMPT_TRACER */
cheers, daniel
On 29 August 2016 at 15:13, Daniel Wagner daniel.wagner@bmw-carit.de wrote:
Hi Binoy,
Some minor nitpicking.
As Steven already pointed out, the cpu field is available in all traces. It's one of the predefined fields (see kernel/trace/trace_events.c/trace_define_generic_fields()). Just drop it.
Hi Daniel,
Thank you for looking at the patch again. When I do not use cpu as a field, I am not able to make it part of the key or value fields. I can only use cpu in the filter predicate.
For example, the following trigger does not seem to work: 'hist:key=ltype,cpu:val=latency:sort=ltype,cpu if ltype==0'
But the following works: 'hist:key=ltype,latency:val=hitcount:sort=latency if cpu==1'
-Binoy
On Tue, 30 Aug 2016 15:11:26 +0530 Binoy Jayan binoy.jayan@linaro.org wrote:
On 29 August 2016 at 15:13, Daniel Wagner daniel.wagner@bmw-carit.de wrote:
Hi Binoy,
Some minor nitpicking.
As Steven already pointed out, the cpu field is available in all traces. It's one of the predefined fields (see kernel/trace/trace_events.c/trace_define_generic_fields()). Just drop it.
Hi Daniel,
Thank you for looking at the patch again. When I do not use cpu as a field, I am not able to make it part of the key or value fields. I can only use cpu in the filter predicate.
For example, the following trigger does not seem to work: 'hist:key=ltype,cpu:val=latency:sort=ltype,cpu if ltype==0'
But the following works: 'hist:key=ltype,latency:val=hitcount:sort=latency if cpu==1'
What about using "CPU" (all caps)? Look at kernel/trace/trace_events_filter.c for FILTER_CPU.
Lets not add fields when we can fix the generic code to process these.
Thanks!
-- Steve
Hi Steven,
'hist:key=ltype,latency:val=hitcount:sort=latency if cpu==1'
What about using "CPU" (all caps)? Look at kernel/trace/trace_events_filter.c for FILTER_CPU.
Lets not add fields when we can fix the generic code to process these.
Well at least the size info needs to be added to the trace field. Without it hist will bail out with no memory.
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 03c0a48..ab8958f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -152,7 +152,8 @@ EXPORT_SYMBOL_GPL(trace_define_field);
#define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ - #item, 0, 0, is_signed_type(type), \ + #item, 0, sizeof(type), \ + is_signed_type(type), \ filter_type); \ if (ret) \ return ret;
After that I get this funky report (with only 4 cpus):
echo "'hist:key=cpu,id.syscall' > /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist'
# event histogram # # trigger info: hist:keys=cpu,id.syscall:vals=hitcount:sort=hitcount:size=2048 [active] #
{ cpu: 262161, id: sys_newfstatat [262] } hitcount: 1 { cpu: 262161, id: sys_write [ 1] } hitcount: 1 { cpu: 262161, id: sys_read [ 0] } hitcount: 1 { cpu: 262161, id: sys_recvmsg [ 47] } hitcount: 1 { cpu: 17, id: sys_inotify_init1 [294] } hitcount: 2 { cpu: 17, id: sys_mknod [133] } hitcount: 2 { cpu: 17, id: sys_setuid [105] } hitcount: 2 { cpu: 17, id: sys_getcwd [ 79] } hitcount: 2 { cpu: 17, id: sys_accept [ 43] } hitcount: 2 { cpu: 17, id: sys_chroot [161] } hitcount: 2 { cpu: 17, id: sys_fadvise64 [221] } hitcount: 2 { cpu: 17, id: sys_umount [166] } hitcount: 2 { cpu: 17, id: sys_rt_sigtimedwait [128] } hitcount: 2 { cpu: 17, id: sys_mount [165] } hitcount: 2 { cpu: 262161, id: sys_close [ 3] } hitcount: 2 { cpu: 262161, id: sys_newfstat [ 5] } hitcount: 2 { cpu: 262161, id: sys_getrandom [318] } hitcount: 3 { cpu: 262161, id: sys_open [ 2] } hitcount: 3 { cpu: 17, id: sys_setgid [106] } hitcount: 4 { cpu: 17, id: sys_inotify_add_watch [254] } hitcount: 4 { cpu: 17, id: sys_setpriority [141] } hitcount: 4 { cpu: 17, id: sys_getpriority [140] } hitcount: 4 { cpu: 17, id: sys_eventfd2 [290] } hitcount: 4 { cpu: 17, id: sys_listen [ 50] } hitcount: 4 { cpu: 17, id: sys_timerfd_create [283] } hitcount: 4 { cpu: 17, id: sys_dup [ 32] } hitcount: 4 { cpu: 17, id: sys_fstatfs [138] } hitcount: 5 { cpu: 17, id: sys_ftruncate [ 77] } hitcount: 5 { cpu: 17, id: sys_socketpair [ 53] } hitcount: 6 { cpu: 17, id: sys_name_to_handle_at [303] } hitcount: 6 { cpu: 17, id: sys_chmod [ 90] } hitcount: 6 { cpu: 17, id: sys_chown [ 92] } hitcount: 6 { cpu: 17, id: sys_readlink [ 89] } hitcount: 6 { cpu: 17, id: sys_getpid [ 39] } hitcount: 8 { cpu: 17, id: sys_unlinkat [263] } hitcount: 8 { cpu: 17, id: sys_setrlimit [160] } hitcount: 8 { cpu: 17, id: sys_sysinfo [ 99] } hitcount: 8 { cpu: 17, id: sys_getpgrp [111] } hitcount: 8 { cpu: 17, id: sys_getgroups [115] } hitcount: 10 { cpu: 17, id: sys_setsid [112] } hitcount: 10 { cpu: 17, id: sys_getpeername [ 52] } hitcount: 10 
{ cpu: 17, id: sys_gettid [186] } hitcount: 10 { cpu: 17, id: sys_shutdown [ 48] } hitcount: 12 { cpu: 17, id: sys_getppid [110] } hitcount: 12 { cpu: 17, id: sys_chdir [ 80] } hitcount: 14 { cpu: 17, id: sys_alarm [ 37] } hitcount: 14 { cpu: 17, id: sys_pipe [ 22] } hitcount: 14 { cpu: 17, id: sys_keyctl [250] } hitcount: 15 { cpu: 17, id: sys_sendto [ 44] } hitcount: 19 { cpu: 17, id: unknown_syscall [ 15] } hitcount: 20 { cpu: 17, id: sys_utimensat [280] } hitcount: 21 { cpu: 17, id: sys_faccessat [269] } hitcount: 22 { cpu: 17, id: sys_setgroups [116] } hitcount: 22 { cpu: 17, id: sys_pipe2 [293] } hitcount: 22 { cpu: 17, id: sys_setresgid [119] } hitcount: 24 { cpu: 17, id: sys_accept4 [288] } hitcount: 26 { cpu: 17, id: sys_setresuid [117] } hitcount: 26 { cpu: 17, id: unknown_syscall [ 59] } hitcount: 28 { cpu: 17, id: sys_futex [202] } hitcount: 29 { cpu: 17, id: unknown_syscall [312] } hitcount: 34 { cpu: 17, id: sys_waitid [247] } hitcount: 36 { cpu: 17, id: sys_select [ 23] } hitcount: 37 { cpu: 17, id: sys_fchmod [ 91] } hitcount: 49 { cpu: 17, id: sys_statfs [137] } hitcount: 52 { cpu: 17, id: sys_rename [ 82] } hitcount: 55 { cpu: 17, id: sys_openat [257] } hitcount: 64 { cpu: 17, id: sys_newuname [ 63] } hitcount: 68 { cpu: 17, id: sys_getegid [108] } hitcount: 69 { cpu: 17, id: sys_getgid [104] } hitcount: 70 { cpu: 17, id: sys_connect [ 42] } hitcount: 76 { cpu: 17, id: sys_signalfd4 [289] } hitcount: 83 { cpu: 17, id: sys_epoll_create1 [291] } hitcount: 85 { cpu: 17, id: sys_set_tid_address [218] } hitcount: 90 { cpu: 17, id: sys_kill [ 62] } hitcount: 91 { cpu: 17, id: sys_bind [ 49] } hitcount: 91 { cpu: 17, id: unknown_syscall [158] } hitcount: 102 { cpu: 17, id: sys_prctl [157] } hitcount: 117 { cpu: 17, id: unknown_syscall [ 56] } hitcount: 117 { cpu: 17, id: sys_geteuid [107] } hitcount: 119 { cpu: 17, id: sys_ioctl [ 16] } hitcount: 120 { cpu: 17, id: sys_dup2 [ 33] } hitcount: 125 { cpu: 17, id: sys_ppoll [271] } hitcount: 150 { cpu: 17, 
id: sys_getuid [102] } hitcount: 160 { cpu: 17, id: sys_getsockname [ 51] } hitcount: 163 { cpu: 17, id: sys_timerfd_settime [286] } hitcount: 167 { cpu: 17, id: sys_umask [ 95] } hitcount: 172 { cpu: 17, id: sys_wait4 [ 61] } hitcount: 186 { cpu: 17, id: sys_exit_group [231] } hitcount: 189 { cpu: 17, id: sys_set_robust_list [273] } hitcount: 191 { cpu: 17, id: sys_lseek [ 8] } hitcount: 196 { cpu: 17, id: sys_rmdir [ 84] } hitcount: 210 { cpu: 17, id: sys_mkdir [ 83] } hitcount: 211 { cpu: 17, id: sys_setsockopt [ 54] } hitcount: 215 { cpu: 17, id: sys_poll [ 7] } hitcount: 221 { cpu: 17, id: sys_unlink [ 87] } hitcount: 221 { cpu: 17, id: sys_getsockopt [ 55] } hitcount: 222 { cpu: 17, id: sys_socket [ 41] } hitcount: 229 { cpu: 17, id: sys_munmap [ 11] } hitcount: 260 { cpu: 17, id: sys_getrlimit [ 97] } hitcount: 318 { cpu: 17, id: sys_rt_sigprocmask [ 14] } hitcount: 344 { cpu: 17, id: sys_access [ 21] } hitcount: 380 { cpu: 17, id: sys_epoll_ctl [233] } hitcount: 391 { cpu: 17, id: sys_readlinkat [267] } hitcount: 416 { cpu: 17, id: sys_newstat [ 4] } hitcount: 582 { cpu: 17, id: sys_write [ 1] } hitcount: 694 { cpu: 17, id: sys_rt_sigaction [ 13] } hitcount: 732 { cpu: 17, id: sys_brk [ 12] } hitcount: 736 { cpu: 17, id: sys_fcntl [ 72] } hitcount: 818 { cpu: 17, id: sys_sendmsg [ 46] } hitcount: 850 { cpu: 17, id: sys_getrandom [318] } hitcount: 908 { cpu: 17, id: sys_getdents [ 78] } hitcount: 937 { cpu: 17, id: sys_clock_gettime [228] } hitcount: 1015 { cpu: 17, id: sys_epoll_wait [232] } hitcount: 1394 { cpu: 17, id: sys_newfstatat [262] } hitcount: 1544 { cpu: 17, id: sys_mprotect [ 10] } hitcount: 1592 { cpu: 17, id: sys_recvmsg [ 47] } hitcount: 1676 { cpu: 17, id: sys_newlstat [ 6] } hitcount: 1897 { cpu: 17, id: sys_mmap [ 9] } hitcount: 2474 { cpu: 17, id: sys_newfstat [ 5] } hitcount: 3301 { cpu: 17, id: sys_read [ 0] } hitcount: 3570 { cpu: 17, id: sys_open [ 2] } hitcount: 4191 { cpu: 17, id: sys_close [ 3] } hitcount: 4194
Totals: Hits: 40593 Entries: 122 Dropped: 0
Still trying to figure out what's going wrong here.
cheers, daniel
On 08/30/2016 04:01 PM, Daniel Wagner wrote:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 03c0a48..ab8958f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -152,7 +152,8 @@ EXPORT_SYMBOL_GPL(trace_define_field); #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \
#item, 0, 0, is_signed_type(type), \
#item, 0, sizeof(type), \
if (ret) \ return ret;is_signed_type(type), \ filter_type); \
After that I get this funky report (with only 4 cpus):
Just setting the size of the type is not enough. The hist_field_* getter functions want to know the offset too:
#define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ { \ type *addr = (type *)(event + hist_field->field->offset); \ \ return (u64)(unsigned long)*addr; \ }
The problem we have here is that there is no cpu field in the event record.
On 08/30/2016 04:20 PM, Daniel Wagner wrote:
Just setting the size of the type is not enough. The hist_field_* getter function want to know the offset too:
With this hack here it should work. The COMM generic field is handled via the tracing_map_ops. We could do it also there, but then we need to add a condition in the tracing_map_ops to distinguish whether it is COMM or CPU. This shortcut here avoids it:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 03c0a48..ab8958f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -152,7 +152,8 @@ EXPORT_SYMBOL_GPL(trace_define_field);
#define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ - #item, 0, 0, is_signed_type(type), \ + #item, 0, sizeof(type), \ + is_signed_type(type), \ filter_type); \ if (ret) \ return ret; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f3a960e..77073b7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -75,6 +75,11 @@ static u64 hist_field_log2(struct hist_field *hist_field, void *event) return (u64) ilog2(roundup_pow_of_two(val)); }
+static u64 hist_field_cpu(struct hist_field *hist_field, void *event) +{ + return (u64) smp_processor_id(); +} + #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ { \ @@ -119,6 +124,7 @@ enum hist_field_flags { HIST_FIELD_FL_SYSCALL = 128, HIST_FIELD_FL_STACKTRACE = 256, HIST_FIELD_FL_LOG2 = 512, + HIST_FIELD_FL_CPU = 1024, };
struct hist_trigger_attrs { @@ -371,6 +377,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, goto out; }
+ if (flags & HIST_FIELD_FL_CPU) { + hist_field->fn = hist_field_cpu; + goto out; + } + if (WARN_ON_ONCE(!field)) goto out;
@@ -556,6 +567,11 @@ static int create_key_field(struct hist_trigger_data *hist_data, key_size = MAX_FILTER_STR_VAL; else key_size = field->size; + + // strcmp(field_name, "cpu") would also work to figure + // out if this is a one of the generic fields. + if (field->filter_type == FILTER_CPU) + flags |= HIST_FIELD_FL_CPU; }
hist_data->fields[key_idx] = create_hist_field(field, flags);
On Tue, 30 Aug 2016 17:02:03 +0200 Daniel Wagner daniel.wagner@bmw-carit.de wrote:
On 08/30/2016 04:20 PM, Daniel Wagner wrote:
Just setting the size of the type is not enough. The hist_field_* getter function want to know the offset too:
With this hack here it should work. The COMM generic field is handled via the tracing_map_ops. We could do it also there but than we need to a condition in the trace_map_ops if it is COMM or CPU. This shortcut here avoids it:
I think the tracing_map_ops is the proper fix. I'd like to keep comm and cpu identical, as they are both basically "meta fields".
-- Steve
Hi Daniel/Steven,
On 30 August 2016 at 20:32, Daniel Wagner daniel.wagner@bmw-carit.de wrote:
On 08/30/2016 04:20 PM, Daniel Wagner wrote:
Just setting the size of the type is not enough. The hist_field_* getter function want to know the offset too:
With this hack here it should work. The COMM generic field is handled via the tracing_map_ops. We could do it also there but than we need to a condition in the trace_map_ops if it is COMM or CPU. This shortcut here avoids it:
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f3a960e..77073b7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -556,6 +567,11 @@ static int create_key_field(struct hist_trigger_data *hist_data, key_size = MAX_FILTER_STR_VAL; else key_size = field->size;
// strcmp(field_name, "cpu") would also work to figure
// out if this is a one of the generic fields.
if (field->filter_type == FILTER_CPU)
flags |= HIST_FIELD_FL_CPU; } hist_data->fields[key_idx] = create_hist_field(field, flags);
I applied Daniel's fix and it seems to work. Can we use the following instead on top of your patch? Like how the other keys are compared against respective fields?
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 46203b7..963a121 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -537,6 +537,12 @@ static int create_key_field(struct hist_trigger_data *hist_data, } else { char *field_name = strsep(&field_str, ".");
+ /* Cannot keep these 2 lines inside the if() below + * as field_str would be NULL for the key 'cpu' + */ + if (strcmp(field_name, "cpu") == 0) + flags |= HIST_FIELD_FL_CPU; + if (field_str) { if (strcmp(field_str, "hex") == 0) flags |= HIST_FIELD_FL_HEX; @@ -568,11 +574,6 @@ static int create_key_field(struct hist_trigger_data *hist_data, else key_size = field->size;
- // strcmp(field_name, "cpu") would also work to figure - // out if this is a one of the generic fields. - if (field->filter_type == FILTER_CPU) - flags |= HIST_FIELD_FL_CPU; - }
hist_data->fields[key_idx] = create_hist_field(field, flags);
- Binoy
Latencies of missed timer offsets. Generate a histogram of missed timer offsets in microseconds. This will be used as a base, along with irq and preemption latencies, to calculate the effective process wakeup latencies.
The following filter(s) may be used
'hist:key=common_pid.execname' 'hist:key=common_pid.execname,cpu:val=toffset,hitcount'
Signed-off-by: Binoy Jayan binoy.jayan@linaro.org --- include/linux/hrtimer.h | 3 +++ include/trace/events/latency.h | 31 +++++++++++++++++++++++++++++++ kernel/time/hrtimer.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5e00f80..e09de14 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -104,6 +104,9 @@ struct hrtimer { struct hrtimer_clock_base *base; u8 state; u8 is_rel; +#if defined(CONFIG_PREEMPT_TRACER) || defined(CONFIG_IRQSOFF_TRACER) + ktime_t praecox; +#endif #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; diff --git a/include/trace/events/latency.h b/include/trace/events/latency.h index 1c34536..746f035 100644 --- a/include/trace/events/latency.h +++ b/include/trace/events/latency.h @@ -31,6 +31,37 @@ DEFINE_EVENT(latency_template, latency_preempt, TP_PROTO(int ltype, int cpu, cycles_t latency), TP_ARGS(ltype, cpu, latency));
+TRACE_EVENT(latency_hrtimer_interrupt, + + TP_PROTO(int cpu, long long toffset, struct task_struct *curr, + struct task_struct *task), + + TP_ARGS(cpu, toffset, curr, task), + + TP_STRUCT__entry( + __field(int, cpu) + __field(long long, toffset) + __array(char, ccomm, TASK_COMM_LEN) + __field(int, cprio) + __array(char, tcomm, TASK_COMM_LEN) + __field(int, tprio) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->toffset = toffset; + memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN); + __entry->cprio = curr->prio; + memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>", + task != NULL ? TASK_COMM_LEN : 7); + __entry->tprio = task != NULL ? task->prio : -1; + ), + + TP_printk("cpu=%d toffset=%lld curr=%s[%d] thread=%s[%d]", + __entry->cpu, __entry->toffset, __entry->ccomm, + __entry->cprio, __entry->tcomm, __entry->tprio) +); + #endif /* _TRACE_HIST_H */
/* This part must be outside protection */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9ba7c82..1a96e34 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -53,9 +53,12 @@ #include <asm/uaccess.h>
#include <trace/events/timer.h> +#include <trace/events/latency.h>
#include "tick-internal.h"
+static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer); + /* * The timer bases: * @@ -960,6 +963,38 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, return tim; }
+static inline void trace_latency_mark_ts(struct hrtimer *timer, + struct hrtimer_clock_base *new_base, + ktime_t tim) +{ +#if defined(CONFIG_PREEMPT_TRACER) || defined(CONFIG_IRQSOFF_TRACER) + if (trace_latency_hrtimer_interrupt_enabled()) { + ktime_t now = new_base->get_time(); + + if (ktime_to_ns(tim) < ktime_to_ns(now)) + timer->praecox = now; + else + timer->praecox = ktime_set(0, 0); + } +#endif +} + +static inline void trace_missed_hrtimer(struct hrtimer *timer, ktime_t basenow) +{ +#if defined(CONFIG_PREEMPT_TRACER) || defined(CONFIG_IRQSOFF_TRACER) + if (trace_latency_hrtimer_interrupt_enabled()) + trace_latency_hrtimer_interrupt(raw_smp_processor_id(), + ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ? + timer->praecox : hrtimer_get_expires(timer), + basenow)), + current, + timer->function == hrtimer_wakeup ? + container_of(timer, struct hrtimer_sleeper, + timer)->task : NULL); +#endif + +} + /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added @@ -992,6 +1027,8 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
timer_stats_hrtimer_set_start_info(timer);
+ trace_latency_mark_ts(timer, new_base, tim); + leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) goto unlock; @@ -1284,6 +1321,8 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
timer = container_of(node, struct hrtimer, node);
+ trace_missed_hrtimer(timer, basenow); + /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the
linaro-kernel@lists.linaro.org