Hi Mike,
Yes, moving the instruction counting to the decoder would provide a useful performance boost. I think we still will need to do a scan through the instructions at the top level in cs_etm__instr_addr() - this is called to find the address of the Nth instruction in a range (used when a sample is to be generated mid way through an executed range of instructions), but that's relatively low impact as it will happen infrequently.
Thanks for the feedback Mathieu - I've updated it with your suggestions. I put the patch out as an RFC because the topic had come up a few times on the mailing list / IRC recently, but it still needs more testing.
Thanks
Rob
-----Original Message----- From: Mike Leach mike.leach@linaro.org Sent: 03 May 2018 11:27 To: Mathieu Poirier mathieu.poirier@linaro.org Cc: Robert Walker Robert.Walker@arm.com; coresight@lists.linaro.org Subject: Re: [RFC PATCH] perf: Support for Arm A32/T32 instruction sets in CoreSight trace
Hi Rob,
A general comment - given that passing the last instruction size in the address range packet would be trivial, that counting instructions and adding a count field would not be much more difficult, and also given the previously noted poor performance of cs_etm__mem_access() which is now being called twice per T32 instruction, would it not be better to co-ordinate this patch set with an updated decoder providing the above packet extentions?
Mike
On 2 May 2018 at 23:11, Mathieu Poirier mathieu.poirier@linaro.org wrote:
Hi Robert,
On 2 May 2018 at 11:50, Robert Walker robert.walker@arm.com wrote:
This patch adds support for generating instruction samples from trace of AArch32 programs using the A32 and T32 instruction sets.
A32 has a fixed 4 byte instruction size, so the conversion between addresses and instruction counts is handled in the same way as A64. T32 has variable 2 or 4 byte instruction size, so the conversion between addresses and instruction counts requires examination of the
instruction bytes.
Signed-off-by: Robert Walker robert.walker@arm.com
This patch is an initial draft of support for trace of A32 / T32 programs. We need to pass the instruction set for each executed range from the decoder and then use the instruction set information when converting between executed address ranges and instruction counts - for T32 we need to look at the individual instructions to determine if they
are 2 or 4 bytes.
I'm a little puzzled with the "initial draft" part... What else needs to be done and any reason for not sending it to the broad mailing list?
Aside from the trivial requests below I'm good with this work.
Thanks, Mathieu
Regards
Rob
tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 21 ++++++ tools/perf/util/cs-etm-decoder/cs-etm-decoder.h | 8 +++ tools/perf/util/cs-etm.c | 93 +++++++++++++++---------- 3 files changed, 86 insertions(+), 36 deletions(-)
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 640af88..fbd356f 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -256,6 +256,7 @@ static void cs_etm_decoder__clear_buffer(struct
cs_etm_decoder *decoder)
for (i = 0; i < MAX_BUFFER; i++) { decoder->packet_buffer[i].start_addr = 0xdeadbeefdeadbeefUL; decoder->packet_buffer[i].end_addr =
0xdeadbeefdeadbeefUL;
decoder->packet_buffer[i].isa = CS_ETM_ISA_UNKNOWN; decoder->packet_buffer[i].last_instr_taken_branch = false; decoder->packet_buffer[i].exc = false; decoder->packet_buffer[i].exc_ret = false; @@ -285,6
+286,7 @@ cs_etm_decoder__buffer_packet(struct cs_etm_decoder
*decoder,
decoder->packet_count++; decoder->packet_buffer[et].sample_type = sample_type;
decoder->packet_buffer[et].isa = CS_ETM_ISA_UNKNOWN; decoder->packet_buffer[et].exc = false; decoder->packet_buffer[et].exc_ret = false; decoder->packet_buffer[et].cpu = *((int *)inode->priv); @@
-314,6 +316,25 @@ cs_etm_decoder__buffer_range(struct
cs_etm_decoder
*decoder,
packet->start_addr = elem->st_addr; packet->end_addr = elem->en_addr;
switch (elem->isa) {
case ocsd_isa_aarch64:
packet->isa = CS_ETM_ISA_A64;
break;
case ocsd_isa_arm:
packet->isa = CS_ETM_ISA_A32;
break;
case ocsd_isa_thumb2:
packet->isa = CS_ETM_ISA_T32;
break;
case ocsd_isa_tee:
case ocsd_isa_jazelle:
case ocsd_isa_custom:
case ocsd_isa_unknown:
default:
packet->isa = CS_ETM_ISA_UNKNOWN;
}
switch (elem->last_i_type) { case OCSD_INSTR_BR: case OCSD_INSTR_BR_INDIRECT:
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h index 743f5f4..630292e 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h @@ -27,10 +27,18 @@ enum cs_etm_sample_type { CS_ETM_TRACE_ON = 1 << 1, };
+enum cs_etm_isa {
CS_ETM_ISA_UNKNOWN,
CS_ETM_ISA_A64,
CS_ETM_ISA_A32,
CS_ETM_ISA_T32,
+};
struct cs_etm_packet { enum cs_etm_sample_type sample_type; u64 start_addr; u64 end_addr;
enum cs_etm_isa isa;
Move this up just below the declaration of @sample_type.
u8 last_instr_taken_branch; u8 exc; u8 exc_ret;
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 1b0d422..004ace3 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -32,14 +32,6 @@
#define MAX_TIMESTAMP (~0ULL)
-/*
- A64 instructions are always 4 bytes
- Only A64 is supported, so can use this constant for converting
between
- addresses and instruction counts, calculting offsets etc
- */
-#define A64_INSTR_SIZE 4
struct cs_etm_auxtrace { struct auxtrace auxtrace; struct auxtrace_queues queues; @@ -494,40 +486,69 @@ static inline void cs_etm__reset_last_branch_rb(struct cs_etm_queue *etmq) etmq->last_branch_rb->nr = 0; }
-static inline u64 cs_etm__last_executed_instr(struct cs_etm_packet *packet) +static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq,
u64 addr) {
u8 instrBytes[2];
cs_etm__mem_access(etmq, addr, 2, instrBytes);
return ((instrBytes[1] & 0xF8) >= 0xE8) ? 4 : 2;
Please add a comment for the 0xF8 and 0xE8 values. I understand what you're doing but other people may not.
+}
+static inline u64 cs_etm__last_executed_instr(struct cs_etm_queue +*etmq) {
const struct cs_etm_packet *packet = etmq->prev_packet;
Here the function prototype has an "etmq" while cs_etm__instr_count() and cs_etm__instr_add() are getting both the "etmq" and "packet". Any reason for the change of heart? If not please harmonise the convention of your choice.
/* * The packet records the execution range with an exclusive end
address
*
* A64 instructions are constant size, so the last executed
* instruction is A64_INSTR_SIZE before the end address
* Will need to do instruction level decode for T32 instructions as
* they can be variable size (not yet supported). */
return packet->end_addr - A64_INSTR_SIZE;
if (packet->isa == CS_ETM_ISA_T32) {
/*
* T32 instructions can be either 2 or 4 bytes
*/
if (packet->end_addr - packet->start_addr == 2)
/* Only one 2 byte instruction in packet*/
return packet->start_addr;
else if (cs_etm__t32_instr_size(etmq, packet->end_addr) == 4)
return packet->end_addr - 4;
else
return packet->end_addr - 2;
} else {
/* Assume a 4 byte instruction size (A32/A64) */
return packet->end_addr - 4;
}
}
-static inline u64 cs_etm__instr_count(const struct cs_etm_packet *packet) +static inline u64 cs_etm__instr_count(struct cs_etm_queue *etmq,
const struct cs_etm_packet
+*packet) {
/*
* Only A64 instructions are currently supported, so can get
* instruction count by dividing.
* Will need to do instruction level decode for T32 instructions as
* they can be variable size (not yet supported).
*/
return (packet->end_addr - packet->start_addr) / A64_INSTR_SIZE;
if (packet->isa == CS_ETM_ISA_T32) {
u64 addr = packet->start_addr;
u64 count = 0;
while (addr != packet->end_addr) {
addr += cs_etm__t32_instr_size(etmq, addr);
count++;
}
return count;
} else {
/* Assume a 4 byte instruction size (A32/A64) */
return (packet->end_addr - packet->start_addr) / 4;
}
}
-static inline u64 cs_etm__instr_addr(const struct cs_etm_packet *packet, +static inline u64 cs_etm__instr_addr(struct cs_etm_queue *etmq,
const struct cs_etm_packet
+*packet, u64 offset) {
/*
* Only A64 instructions are currently supported, so can get
* instruction address by muliplying.
* Will need to do instruction level decode for T32 instructions as
* they can be variable size (not yet supported).
*/
return packet->start_addr + offset * A64_INSTR_SIZE;
if (packet->isa == CS_ETM_ISA_T32) {
u64 addr = packet->start_addr;
while (offset > 0) {
addr += cs_etm__t32_instr_size(etmq, addr);
offset--;
}
return addr;
} else {
/* Assume a 4 byte instruction size (A32/A64) */
return packet->start_addr + offset * 4;
}
}
static void cs_etm__update_last_branch_rb(struct cs_etm_queue
*etmq)
@@ -547,7 +568,7 @@ static void cs_etm__update_last_branch_rb(struct
cs_etm_queue *etmq)
etmq->last_branch_pos -= 1; be = &bs->entries[etmq->last_branch_pos];
be->from = cs_etm__last_executed_instr(etmq->prev_packet);
be->from = cs_etm__last_executed_instr(etmq); be->to = etmq->packet->start_addr; /* No support for mispredict */ be->flags.mispred = 0;
@@ -700,7 +721,7 @@ static int cs_etm__synth_branch_sample(struct
cs_etm_queue *etmq)
event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header);
sample.ip = cs_etm__last_executed_instr(etmq->prev_packet);
sample.ip = cs_etm__last_executed_instr(etmq); sample.pid = etmq->pid; sample.tid = etmq->tid; sample.addr = etmq->packet->start_addr; @@ -858,7 +879,7 @@
static int cs_etm__sample(struct cs_etm_queue *etmq) int ret; u64 instrs_executed;
instrs_executed = cs_etm__instr_count(etmq->packet);
instrs_executed = cs_etm__instr_count(etmq, etmq->packet); etmq->period_instructions += instrs_executed; /*
@@ -888,7 +909,7 @@ static int cs_etm__sample(struct cs_etm_queue
*etmq)
* executed, but PC has not advanced to next instruction) */ u64 offset = (instrs_executed - instrs_over - 1);
u64 addr = cs_etm__instr_addr(etmq->packet, offset);
u64 addr = cs_etm__instr_addr(etmq, etmq->packet,
offset);
ret = cs_etm__synth_instruction_sample( etmq, addr, etm->instructions_sample_period);
@@ -936,7 +957,7 @@ static int cs_etm__flush(struct cs_etm_queue
*etmq)
* Use the address of the end of the last reported execution * range */
u64 addr = cs_etm__last_executed_instr(etmq->prev_packet);
u64 addr = cs_etm__last_executed_instr(etmq); err = cs_etm__synth_instruction_sample( etmq, addr,
-- 2.7.4
CoreSight mailing list CoreSight@lists.linaro.org https://lists.linaro.org/mailman/listinfo/coresight
-- Mike Leach Principal Engineer, ARM Ltd. Blackburn Design Centre. UK
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.