Added user space perf functionality to translate CoreSight traces into last branch stack events. To invoke the new functionality, use the perf inject tool with --itrace=il. For example, to translate the ETM trace from perf.data into last branch records in a new inj.data file:
The parameter to 'l' 128 is parsed as recording the last 128 branches from the sampling point. Also note that right now the parameter to 'i' 100us is ignored, and we end up translating all the branches in the trace.
Signed-off-by: Sebastian Pop s.pop@samsung.com Signed-off-by: Brian Rzycki b.rzycki@samsung.com --- tools/perf/Documentation/cs-etm.txt | 38 +++++++++ tools/perf/util/cs-etm.c | 161 +++++++++++++++++++++++++++++++++++- 2 files changed, 195 insertions(+), 4 deletions(-) create mode 100644 tools/perf/Documentation/cs-etm.txt
diff --git a/tools/perf/Documentation/cs-etm.txt b/tools/perf/Documentation/cs-etm.txt new file mode 100644 index 0000000..bf4d427 --- /dev/null +++ b/tools/perf/Documentation/cs-etm.txt @@ -0,0 +1,38 @@ +ARM CoreSight Embedded Trace Macrocell +====================================== + +Overview +======== + +[To be done.] + + +perf inject +=========== + +perf inject also accepts the --itrace option in which case tracing data is +removed and replaced with the synthesized events. e.g. + + perf inject --itrace -i perf.data -o perf.data.new + +Below is an example of using ARM ETM for autoFDO. It requires autofdo +(https://github.com/google/autofdo) and gcc version 5. The bubble +sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial). + + $ gcc-5 -O3 sort.c -o sort_optimized + $ taskset -c 2 ./sort_optimized + Bubble sorting array of 30000 elements + 5910 ms + + $ perf record -e cs_etm/@20070000.etr/u --per-thread taskset -c 2 ./sort + Bubble sorting array of 30000 elements + 12543 ms + [ perf record: Woken up 35 times to write data ] + [ perf record: Captured and wrote 69.640 MB perf.data ] + + $ perf inject -i perf.data -o inj.data --itrace=il64 --strip + $ create_gcov --binary=./sort --profile=inj.data --gcov=sort.gcov -gcov_version=1 + $ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo + $ taskset -c 2 ./sort_autofdo + Bubble sorting array of 30000 elements + 5806 ms diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 1a345c28..b7d6b54 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -89,6 +89,14 @@ struct cs_etm_queue { u64 offset; bool eot; bool kernel_mapped; + // Stack of branches in reverse chronological order that will be copied + // to a last branch event sample. + struct branch_stack *last_branch; + // A circular buffer used to record last branches as they are decoded + // from the trace. + struct branch_stack *last_branch_rb; + // Position in the circular buffer where the last branch has been inserted. + size_t last_branch_pos; };
static int cs_etm__get_trace(struct cs_etm_buffer *buff, struct cs_etm_queue *etmq); @@ -209,6 +217,8 @@ static void cs_etm__free_queue(void *priv) thread__zput(etmq->thread); cs_etm_decoder__free(etmq->decoder); zfree(&etmq->event_buf); + zfree(&etmq->last_branch); + zfree(&etmq->last_branch_rb); zfree(&etmq->chain); free(etmq); } @@ -373,6 +383,19 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, etmq->chain = NULL; }
+ if (etm->synth_opts.last_branch) { + size_t sz = sizeof(struct branch_stack); + + sz += etm->synth_opts.last_branch_sz * + sizeof(struct branch_entry); + etmq->last_branch = zalloc(sz); + if (!etmq->last_branch) + goto out_free; + etmq->last_branch_rb = zalloc(sz); + if (!etmq->last_branch_rb) + goto out_free; + } + etmq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE); if (!etmq->event_buf) goto out_free; @@ -419,6 +442,8 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
out_free: zfree(&etmq->event_buf); + zfree(&etmq->last_branch); + zfree(&etmq->last_branch_rb); zfree(&etmq->chain); free(etmq); return NULL; @@ -514,6 +539,82 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm) return 0; }
+static inline void cs_etm__copy_last_branch_rb(struct cs_etm_queue *etmq) +{ + struct branch_stack *bs_src = etmq->last_branch_rb; + struct branch_stack *bs_dst = etmq->last_branch; + size_t nr = 0; + + // Set the number of records before early exit: ->nr is used to + // determine how many branches to copy from ->entries. + bs_dst->nr = bs_src->nr; + + // Early exit when there is nothing to copy. + if (!bs_src->nr) + return; + + // As bs_src->entries is a circular buffer, we need to copy from it in + // two steps. First, copy the branches from the most recently inserted + // branch ->last_branch_pos until the end of bs_src->entries buffer. + nr = etmq->etm->synth_opts.last_branch_sz - etmq->last_branch_pos; + memcpy(&bs_dst->entries[0], + &bs_src->entries[etmq->last_branch_pos], + sizeof(struct branch_entry) * nr); + + // If we wrapped around at least once, the branches from the beginning + // of the bs_src->entries buffer and until the ->last_branch_pos element + // are older valid branches: copy them over. The total number of + // branches copied over will be equal to the number of branches asked by + // the user in last_branch_sz. + if (bs_src->nr >= etmq->etm->synth_opts.last_branch_sz) { + memcpy(&bs_dst->entries[nr], + &bs_src->entries[0], + sizeof(struct branch_entry) * etmq->last_branch_pos); + } +} + +static inline void cs_etm__reset_last_branch_rb(struct cs_etm_queue *etmq) +{ + etmq->last_branch_pos = 0; + etmq->last_branch_rb->nr = 0; +} + +static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq, + struct cs_etm_packet *packet) +{ + struct branch_stack *bs = etmq->last_branch_rb; + struct branch_entry *be; + + // The branches are recorded in a circular buffer in reverse + // chronological order: we start recording from the last element of the + // buffer down. After writing the first element of the stack, move the + // insert position back to the end of the buffer. + if (!etmq->last_branch_pos) + etmq->last_branch_pos = etmq->etm->synth_opts.last_branch_sz; + + etmq->last_branch_pos -= 1; + + be = &bs->entries[etmq->last_branch_pos]; + be->from = packet->start_addr; + be->to = packet->end_addr; + /* No support for mispredict */ + be->flags.mispred = 0; + be->flags.predicted = 1; + + // Increment bs->nr until reaching the number of last branches asked by + // the user on the command line. + if (bs->nr < etmq->etm->synth_opts.last_branch_sz) + bs->nr += 1; +} + +static int cs_etm__inject_event(union perf_event *event, + struct perf_sample *sample, u64 type, + bool swapped) +{ + event->header.size = perf_event__sample_event_size(sample, type, 0); + return perf_event__synthesize_sample(event, type, 0, sample, swapped); +} + #if 0 struct cs_etm_cache_entry { struct auxtrace_cache_entry entry; @@ -630,6 +731,11 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
//etmq->last_insn_cnt = etmq->state->tot_insn_cnt;
+ if (etm->synth_opts.last_branch) { + cs_etm__copy_last_branch_rb(etmq); + sample.branch_stack = etmq->last_branch; + } + #if 0 { struct addr_location al; @@ -689,12 +795,24 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, } #endif
+ if (etm->synth_opts.inject) { + ret = cs_etm__inject_event(event, &sample, + etm->instructions_sample_type, + etm->synth_needs_swap); + if (ret) + return ret; + } + ret = perf_session__deliver_synth_event(etm->session,event, &sample);
if (ret) { pr_err("CS ETM Trace: failed to deliver instruction event, error %d\n", ret);
} + + if (etm->synth_opts.last_branch) + cs_etm__reset_last_branch_rb(etmq); + return ret; }
@@ -783,6 +901,8 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, attr.config = PERF_COUNT_HW_INSTRUCTIONS; attr.sample_period = etm->synth_opts.period; etm->instructions_sample_period = attr.sample_period; + if (etm->synth_opts.last_branch) + attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; err = cs_etm__synth_event(session, &attr, id);
if (err) { @@ -800,7 +920,7 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, return 0; }
-static int cs_etm__sample(struct cs_etm_queue *etmq) +static int cs_etm__sample(struct cs_etm_queue *etmq, int *cpu) { //const struct cs_etm_state *state = etmq->state; struct cs_etm_packet packet; @@ -809,8 +929,24 @@ static int cs_etm__sample(struct cs_etm_queue *etmq)
err = cs_etm_decoder__get_packet(etmq->decoder,&packet); // if there is no sample, it returns err = -1, no real error - - if (!err && packet.sample_type & CS_ETM_RANGE) { + if (err) + return err; + if (etmq->etm->synth_opts.last_branch) { + *cpu = packet.cpu; + + // FIXME: as the trace sampling does not work for now, (for + // example perf inject --itrace=i100us will not generate events + // every 100 micro-seconds), generate a last branch event after + // having decoded last_branch_sz branch samples. This condition + // should be rewritten as "if reached sampling period". + if (etmq->last_branch_rb->nr == + etmq->etm->synth_opts.last_branch_sz) { + err = cs_etm__synth_instruction_sample(etmq, &packet); + if (err) + return err; + } + cs_etm__update_last_branch_rb(etmq, &packet); + } else if (packet.sample_type & CS_ETM_RANGE) { err = cs_etm__synth_instruction_sample(etmq,&packet); if (err) return err; @@ -823,6 +959,7 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq, u64 *timestamp) struct cs_etm_buffer buffer; size_t buffer_used; int err = 0; + int cpu = 0;
/* Go through each buffer in the queue and decode them one by one */ more: @@ -843,8 +980,24 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq, u64 *timestamp) etmq->offset += processed; buffer_used += processed; if (!err) - cs_etm__sample(etmq); + cs_etm__sample(etmq, &cpu); } while (!etmq->eot && (buffer.len > buffer_used)); + + // Generate a last branch event for the branches left in the circular + // buffer at the end of the trace. + if (etmq->etm->synth_opts.last_branch) { + struct cs_etm_packet packet; + struct branch_stack *bs = etmq->last_branch_rb; + struct branch_entry *be = &bs->entries[etmq->last_branch_pos]; + packet.cpu = cpu; + packet.start_addr = be->from; + packet.end_addr = be->to; + + err = cs_etm__synth_instruction_sample(etmq, &packet); + if (err) + return err; + } + goto more;
(void) timestamp;