On 20 December 2016 at 21:51, Sebastian Pop <sebpop.llvm@gmail.com> wrote:
Added user space perf functionality to translate CoreSight traces into last branch stack events. To invoke the new functionality, use the perf inject tool with --itrace=il. For example, to translate the ETM trace from perf.data into last branch records in a new perf.data.new file:
$ perf inject --itrace=i100usl128 -i perf.data -o perf.data.new
The parameter to 'l', 128, is parsed as recording the last 128 branches from the sampling point. Also note that the parameter to 'i', 100us, is currently ignored, and we end up translating all the branches in the trace.
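For orientation: the 'l' value ends up in synth_opts.last_branch_sz and sizes the
variable-length branch stack attached to each synthesized sample. A minimal sketch
of the layout involved (mirroring the perf structs in tools/perf/util/event.h,
reduced here for illustration, only to make the sizing arithmetic in the patch
concrete):

	typedef unsigned long long u64;

	struct branch_flags {			/* reduced to the bits used here */
		u64 mispred:1;
		u64 predicted:1;
	};

	struct branch_entry {			/* one from->to branch record */
		u64 from;
		u64 to;
		struct branch_flags flags;
	};

	struct branch_stack {
		u64 nr;				/* number of valid entries */
		struct branch_entry entries[];	/* last_branch_sz records */
	};

	/* for --itrace=il128, each last-branch buffer is sized as */
	size_t sz = sizeof(struct branch_stack) +
		    128 * sizeof(struct branch_entry);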
Hello Sebastian,
I was in the process of adding your patches to the latest branch on github when I noticed two things:
1) checkpatch.pl gives me 144 errors and 96 warnings. Most of them are related to a '$' character found at the end of each line (see attached file). When looking at the patch using 'vi' the spurious characters can't be seen, but checkpatch definitely catches them. Do you see the same thing on your side? There is also the matter of spaces vs. tabs at the beginning of each line - please use tabs. We are in the process of fixing the style hiccups in the original code.
2) The address in the "To" line, i.e. "sebpop.llvm@gmail.com", doesn't match the one in the S.O.B., i.e. "s.pop@samsung.com". Which one you use is of no importance, at least to me, but they have to be the same.
Thanks,
Mathieu
Signed-off-by: Sebastian Pop <s.pop@samsung.com>
Signed-off-by: Brian Rzycki <b.rzycki@samsung.com>
 Documentation/trace/coresight.txt |  31 +++++++
 tools/perf/util/cs-etm.c          | 181 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 209 insertions(+), 3 deletions(-)
diff --git a/Documentation/trace/coresight.txt b/Documentation/trace/coresight.txt
index a2e7ccb..acb5795 100644
--- a/Documentation/trace/coresight.txt
+++ b/Documentation/trace/coresight.txt
@@ -440,3 +440,34 @@ Details on how to use the generic STM API can be found here [2].
 [1]. Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
 [2]. Documentation/trace/stm.txt
+
+Generating coverage files for Feedback Directed Optimization: AutoFDO
+---------------------------------------------------------------------
+
+perf inject accepts the --itrace option in which case tracing data is
+removed and replaced with the synthesized events. e.g.
+
+	perf inject --itrace -i perf.data -o perf.data.new
+
+Below is an example of using ARM ETM for autoFDO. It requires autofdo
+(https://github.com/google/autofdo) and gcc version 5. The bubble
+sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial).
+
+	$ gcc-5 -O3 sort.c -o sort
+	$ taskset -c 2 ./sort
+	Bubble sorting array of 30000 elements
+	5910 ms
+
+	$ perf record -e cs_etm/@20070000.etr/u --per-thread taskset -c 2 ./sort
+	Bubble sorting array of 30000 elements
+	12543 ms
+	[ perf record: Woken up 35 times to write data ]
+	[ perf record: Captured and wrote 69.640 MB perf.data ]
+
+	$ perf inject -i perf.data -o inj.data --itrace=il64 --strip
+	$ create_gcov --binary=./sort --profile=inj.data --gcov=sort.gcov -gcov_version=1
+
+	$ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo
+	$ taskset -c 2 ./sort_autofdo
+	Bubble sorting array of 30000 elements
+	5806 ms
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 1a345c28..da1b0f3 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -89,6 +89,20 @@ struct cs_etm_queue {
 	u64 offset;
 	bool eot;
 	bool kernel_mapped;
+	/*
+	 * Stack of branches in reverse chronological order that will be copied
+	 * to a last branch event sample.
+	 */
+	struct branch_stack *last_branch;
+	/*
+	 * A circular buffer used to record last branches as they are decoded
+	 * from the trace.
+	 */
+	struct branch_stack *last_branch_rb;
+	/*
+	 * Position in the circular buffer where the last branch has been inserted.
+	 */
+	size_t last_branch_pos;
 };

 static int cs_etm__get_trace(struct cs_etm_buffer *buff, struct cs_etm_queue *etmq);

@@ -209,6 +223,8 @@ static void cs_etm__free_queue(void *priv)
 	thread__zput(etmq->thread);
 	cs_etm_decoder__free(etmq->decoder);
 	zfree(&etmq->event_buf);
+	zfree(&etmq->last_branch);
+	zfree(&etmq->last_branch_rb);
 	zfree(&etmq->chain);
 	free(etmq);
 }

@@ -373,6 +389,19 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
 		etmq->chain = NULL;
 	}
+	if (etm->synth_opts.last_branch) {
+		size_t sz = sizeof(struct branch_stack);
+
+		sz += etm->synth_opts.last_branch_sz *
+		      sizeof(struct branch_entry);
+		etmq->last_branch = zalloc(sz);
+		if (!etmq->last_branch)
+			goto out_free;
+		etmq->last_branch_rb = zalloc(sz);
+		if (!etmq->last_branch_rb)
+			goto out_free;
+	}
+
 	etmq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
 	if (!etmq->event_buf)
 		goto out_free;
@@ -419,6 +448,8 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,

 out_free:
 	zfree(&etmq->event_buf);
+	zfree(&etmq->last_branch);
+	zfree(&etmq->last_branch_rb);
 	zfree(&etmq->chain);
 	free(etmq);
 	return NULL;

@@ -514,6 +545,94 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
 	return 0;
 }
+
+static inline void cs_etm__copy_last_branch_rb(struct cs_etm_queue *etmq)
+{
+	struct branch_stack *bs_src = etmq->last_branch_rb;
+	struct branch_stack *bs_dst = etmq->last_branch;
+	size_t nr = 0;
+
+	/*
+	 * Set the number of records before early exit: ->nr is used to
+	 * determine how many branches to copy from ->entries.
+	 */
+	bs_dst->nr = bs_src->nr;
+
+	/*
+	 * Early exit when there is nothing to copy.
+	 */
+	if (!bs_src->nr)
+		return;
+
+	/*
+	 * As bs_src->entries is a circular buffer, we need to copy from it in
+	 * two steps. First, copy the branches from the most recently inserted
+	 * branch ->last_branch_pos until the end of bs_src->entries buffer.
+	 */
+	nr = etmq->etm->synth_opts.last_branch_sz - etmq->last_branch_pos;
+	memcpy(&bs_dst->entries[0],
+	       &bs_src->entries[etmq->last_branch_pos],
+	       sizeof(struct branch_entry) * nr);
+
+	/*
+	 * If we wrapped around at least once, the branches from the beginning
+	 * of the bs_src->entries buffer and until the ->last_branch_pos element
+	 * are older valid branches: copy them over. The total number of
+	 * branches copied over will be equal to the number of branches asked by
+	 * the user in last_branch_sz.
+	 */
+	if (bs_src->nr >= etmq->etm->synth_opts.last_branch_sz) {
+		memcpy(&bs_dst->entries[nr],
+		       &bs_src->entries[0],
+		       sizeof(struct branch_entry) * etmq->last_branch_pos);
+	}
+}
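
As a sanity check of the two-step copy above, a small standalone sketch (plain
ints in place of the perf structs; illustration only, not part of the patch):

	#include <stdio.h>
	#include <string.h>

	#define SZ 4	/* stands in for synth_opts.last_branch_sz */

	int main(void)
	{
		/*
		 * Ring contents after branches 1..6 have been inserted the
		 * way cs_etm__update_last_branch_rb() does it (backwards,
		 * with wrap-around): the newest entry sits at pos = 2.
		 */
		int ring[SZ] = { 4, 3, 6, 5 };
		size_t pos = 2, n = SZ - pos;
		int stack[SZ];

		/* step 1: from the newest entry to the end of the ring */
		memcpy(stack, &ring[pos], n * sizeof(int));
		/* step 2: the older, wrapped-around entries */
		memcpy(&stack[n], ring, pos * sizeof(int));

		for (n = 0; n < SZ; n++)
			printf("%d ", stack[n]);	/* prints: 6 5 4 3 */
		printf("\n");
		return 0;
	}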
+
+static inline void cs_etm__reset_last_branch_rb(struct cs_etm_queue *etmq)
+{
+	etmq->last_branch_pos = 0;
+	etmq->last_branch_rb->nr = 0;
+}
+
+static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq,
+					  struct cs_etm_packet *packet)
+{
+	struct branch_stack *bs = etmq->last_branch_rb;
+	struct branch_entry *be;
+
+	/*
+	 * The branches are recorded in a circular buffer in reverse
+	 * chronological order: we start recording from the last element of the
+	 * buffer down. After writing the first element of the stack, move the
+	 * insert position back to the end of the buffer.
+	 */
+	if (!etmq->last_branch_pos)
+		etmq->last_branch_pos = etmq->etm->synth_opts.last_branch_sz;
+
+	etmq->last_branch_pos -= 1;
+
+	be = &bs->entries[etmq->last_branch_pos];
+	be->from = packet->start_addr;
+	be->to = packet->end_addr;
+	/* No support for mispredict */
+	be->flags.mispred = 0;
+	be->flags.predicted = 1;
+
+	/*
+	 * Increment bs->nr until reaching the number of last branches asked by
+	 * the user on the command line.
+	 */
+	if (bs->nr < etmq->etm->synth_opts.last_branch_sz)
+		bs->nr += 1;
+}
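
(Illustration, not part of the patch: with last_branch_sz = 4 and branches
b1..b5 arriving in that order, the function above writes them at positions
3, 2, 1, 0 and then 3 again, so b5 overwrites b1, while bs->nr counts
1, 2, 3, 4 and then stays at 4.)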
+
+static int cs_etm__inject_event(union perf_event *event,
+				struct perf_sample *sample, u64 type,
+				bool swapped)
+{
+	event->header.size = perf_event__sample_event_size(sample, type, 0);
+	return perf_event__synthesize_sample(event, type, 0, sample, swapped);
+}
+
 #if 0
 struct cs_etm_cache_entry {
 	struct auxtrace_cache_entry entry;

@@ -630,6 +749,11 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
 	//etmq->last_insn_cnt = etmq->state->tot_insn_cnt;

+	if (etm->synth_opts.last_branch) {
+		cs_etm__copy_last_branch_rb(etmq);
+		sample.branch_stack = etmq->last_branch;
+	}
+
 #if 0
 	{
 		struct addr_location al;

@@ -689,12 +813,24 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
 	}
 #endif
+	if (etm->synth_opts.inject) {
+		ret = cs_etm__inject_event(event, &sample,
+					   etm->instructions_sample_type,
+					   etm->synth_needs_swap);
+		if (ret)
+			return ret;
+	}
+
 	ret = perf_session__deliver_synth_event(etm->session, event, &sample);
 	if (ret) {
 		pr_err("CS ETM Trace: failed to deliver instruction event, error %d\n", ret);
 	}

+	if (etm->synth_opts.last_branch)
+		cs_etm__reset_last_branch_rb(etmq);
+
 	return ret;
 }
@@ -783,6 +919,8 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm,
 		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
 		attr.sample_period = etm->synth_opts.period;
 		etm->instructions_sample_period = attr.sample_period;
+		if (etm->synth_opts.last_branch)
+			attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
 		err = cs_etm__synth_event(session, &attr, id);
 		if (err) {

@@ -800,7 +938,7 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm,
 	return 0;
 }
-static int cs_etm__sample(struct cs_etm_queue *etmq)
+static int cs_etm__sample(struct cs_etm_queue *etmq, int *cpu)
 {
 	//const struct cs_etm_state *state = etmq->state;
 	struct cs_etm_packet packet;

@@ -809,8 +947,26 @@ static int cs_etm__sample(struct cs_etm_queue *etmq)
 	err = cs_etm_decoder__get_packet(etmq->decoder, &packet);
 	// if there is no sample, it returns err = -1, no real error
 	if (err)
 		return err;

+	if (etmq->etm->synth_opts.last_branch) {
+		*cpu = packet.cpu;
+		if (!err && packet.sample_type & CS_ETM_RANGE) {
+			/*
+			 * FIXME: as the trace sampling does not work for now (for
+			 * example perf inject --itrace=i100us will not generate events
+			 * every 100 micro-seconds), generate a last branch event after
+			 * having decoded last_branch_sz branch samples. This condition
+			 * should be rewritten as "if reached sampling period".
+			 */
+			if (etmq->last_branch_rb->nr ==
+			    etmq->etm->synth_opts.last_branch_sz) {
+				err = cs_etm__synth_instruction_sample(etmq, &packet);
+				if (err)
+					return err;
+			}
+			cs_etm__update_last_branch_rb(etmq, &packet);
+		}
+	} else if (packet.sample_type & CS_ETM_RANGE) {
+		err = cs_etm__synth_instruction_sample(etmq, &packet);
+		if (err)
+			return err;
+	}
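
One possible shape for the period check the FIXME asks for, assuming a
per-queue instruction counter (etmq->period_insn and packet.insn_count are
invented names here, not fields in this patch):

	/* hypothetical: instructions decoded since the last synthesized sample */
	etmq->period_insn += packet.insn_count;
	if (etmq->period_insn >= etmq->etm->instructions_sample_period) {
		etmq->period_insn -= etmq->etm->instructions_sample_period;
		err = cs_etm__synth_instruction_sample(etmq, &packet);
		if (err)
			return err;
	}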
@@ -823,6 +979,7 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq, u64 *timestamp)
 	struct cs_etm_buffer buffer;
 	size_t buffer_used;
 	int err = 0;
+	int cpu = 0;

 	/* Go through each buffer in the queue and decode them one by one */
 more:

@@ -843,8 +1000,26 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq, u64 *timestamp)
 			etmq->offset += processed;
 			buffer_used += processed;
 			if (!err)
-				cs_etm__sample(etmq);
+				cs_etm__sample(etmq, &cpu);
 		} while (!etmq->eot && (buffer.len > buffer_used));
+
+	/*
+	 * Generate a last branch event for the branches left in the circular
+	 * buffer at the end of the trace.
+	 */
+	if (etmq->etm->synth_opts.last_branch) {
+		struct cs_etm_packet packet;
+		struct branch_stack *bs = etmq->last_branch_rb;
+		struct branch_entry *be = &bs->entries[etmq->last_branch_pos];
+
+		packet.cpu = cpu;
+		packet.start_addr = be->from;
+		packet.end_addr = be->to;
+
+		err = cs_etm__synth_instruction_sample(etmq, &packet);
+		if (err)
+			return err;
+	}
+
 	goto more;

 	(void) timestamp;
--
2.7.4