Hi Dehao,
On Tue, May 16, 2017 at 7:17 PM, Dehao Chen <dehao@google.com> wrote:
> Could you copy-paste the code with line number (the one you pasted does not
> match the profile).
Please see attached sort.c
> Could you also provide the assembly of the -g1 arm
> binary so that I can examine the debug info?
# gcc -O3 -g1 sort.c -S -o sort-aarch64.s
See attached sort-aarch64.s
>
> Also, it would be helpful if you can use sample_merger to generate the text
> version of the perf.data file and attach that to the thread too
Could you please provide the full command line I should be using?
I tried the following with no meaningful output:
# sample_merger -profile inj
# cat data.txt
0
0
0
When I dump the raw trace with:
# perf report -D -i inj > dump.txt
one of the LBR events looks like this:
0x117340 [0x630]: PERF_RECORD_SAMPLE(IP, 0x2): 15163/15163: 0x40088c
period: 3 addr: 0
... branch stack: nr:64
..... 0: 0000ffffa3693ea4 -> 0000ffffa3693eac 0 cycles P 0
..... 1: 0000ffffa3693a10 -> 0000ffffa3693a20 0 cycles P 0
..... 2: 0000ffffa3693a08 -> 0000ffffa3693a10 0 cycles P 0
..... 3: 0000ffffa36939f4 -> 0000ffffa3693a08 0 cycles P 0
..... 4: 0000ffffa3693b88 -> 0000ffffa3693ba0 0 cycles P 0
..... 5: 0000ffffa3693b58 -> 0000ffffa3693b88 0 cycles P 0
..... 6: 0000ffffa3693b4c -> 0000ffffa3693b58 0 cycles P 0
..... 7: 0000ffffa3693b40 -> 0000ffffa3693b4c 0 cycles P 0
..... 8: 0000ffffa36939e4 -> 0000ffffa36939f4 0 cycles P 0
..... 9: 0000ffffa36939e0 -> 0000ffffa36939e4 0 cycles P 0
..... 10: 0000ffffa36939d8 -> 0000ffffa36939e0 0 cycles P 0
..... 11: 0000ffffa36939b0 -> 0000ffffa36939d8 0 cycles P 0
..... 12: 0000ffffa3693e98 -> 0000ffffa3693ea4 0 cycles P 0
..... 13: 00000000004005a0 -> 00000000004005b0 0 cycles P 0
..... 14: 0000000000400888 -> 000000000040088c 0 cycles P 0
..... 15: 000000000040088c -> 0000000000400898 0 cycles P 0
..... 16: 0000ffffa3693ea4 -> 0000ffffa3693eac 0 cycles P 0
..... 17: 0000ffffa3693a10 -> 0000ffffa3693a20 0 cycles P 0
..... 18: 0000ffffa3693a08 -> 0000ffffa3693a10 0 cycles P 0
..... 19: 0000ffffa36939f4 -> 0000ffffa3693a08 0 cycles P 0
..... 20: 0000ffffa3693b88 -> 0000ffffa3693ba0 0 cycles P 0
..... 21: 0000ffffa3693b58 -> 0000ffffa3693b88 0 cycles P 0
..... 22: 0000ffffa3693b4c -> 0000ffffa3693b58 0 cycles P 0
..... 23: 0000ffffa3693b40 -> 0000ffffa3693b4c 0 cycles P 0
..... 24: 0000ffffa36939e4 -> 0000ffffa36939f4 0 cycles P 0
..... 25: 0000ffffa36939e0 -> 0000ffffa36939e4 0 cycles P 0
..... 26: 0000ffffa36939d8 -> 0000ffffa36939e0 0 cycles P 0
..... 27: 0000ffffa36939b0 -> 0000ffffa36939d8 0 cycles P 0
..... 28: 0000ffffa3693e98 -> 0000ffffa3693ea4 0 cycles P 0
..... 29: 00000000004005a0 -> 00000000004005b0 0 cycles P 0
..... 30: 0000000000400888 -> 000000000040088c 0 cycles P 0
..... 31: 000000000040088c -> 0000000000400898 0 cycles P 0
..... 32: 0000ffffa3693ea4 -> 0000ffffa3693eac 0 cycles P 0
..... 33: 0000ffffa3693a10 -> 0000ffffa3693a20 0 cycles P 0
..... 34: 0000ffffa3693a08 -> 0000ffffa3693a10 0 cycles P 0
..... 35: 0000ffffa36939f4 -> 0000ffffa3693a08 0 cycles P 0
..... 36: 0000ffffa3693b88 -> 0000ffffa3693ba0 0 cycles P 0
..... 37: 0000ffffa3693b58 -> 0000ffffa3693b88 0 cycles P 0
..... 38: 0000ffffa3693b4c -> 0000ffffa3693b58 0 cycles P 0
..... 39: 0000ffffa3693b40 -> 0000ffffa3693b4c 0 cycles P 0
..... 40: 0000ffffa36939e4 -> 0000ffffa36939f4 0 cycles P 0
..... 41: 0000ffffa36939e0 -> 0000ffffa36939e4 0 cycles P 0
..... 42: 0000ffffa36939d8 -> 0000ffffa36939e0 0 cycles P 0
..... 43: 0000ffffa36939b0 -> 0000ffffa36939d8 0 cycles P 0
..... 44: 0000ffffa3693e98 -> 0000ffffa3693ea4 0 cycles P 0
..... 45: 00000000004005a0 -> 00000000004005b0 0 cycles P 0
..... 46: 0000000000400888 -> 000000000040088c 0 cycles P 0
..... 47: 000000000040088c -> 0000000000400898 0 cycles P 0
..... 48: 0000ffffa3693ea4 -> 0000ffffa3693eac 0 cycles P 0
..... 49: 0000ffffa3693a10 -> 0000ffffa3693a20 0 cycles P 0
..... 50: 0000ffffa3693a08 -> 0000ffffa3693a10 0 cycles P 0
..... 51: 0000ffffa36939f4 -> 0000ffffa3693a08 0 cycles P 0
..... 52: 0000ffffa3693b88 -> 0000ffffa3693ba0 0 cycles P 0
..... 53: 0000ffffa3693b58 -> 0000ffffa3693b88 0 cycles P 0
..... 54: 0000ffffa3693b4c -> 0000ffffa3693b58 0 cycles P 0
..... 55: 0000ffffa3693b40 -> 0000ffffa3693b4c 0 cycles P 0
..... 56: 0000ffffa36939e4 -> 0000ffffa36939f4 0 cycles P 0
..... 57: 0000ffffa36939e0 -> 0000ffffa36939e4 0 cycles P 0
..... 58: 0000ffffa36939d8 -> 0000ffffa36939e0 0 cycles P 0
..... 59: 0000ffffa36939b0 -> 0000ffffa36939d8 0 cycles P 0
..... 60: 0000ffffa3693e98 -> 0000ffffa3693ea4 0 cycles P 0
..... 61: 00000000004005a0 -> 00000000004005b0 0 cycles P 0
..... 62: 0000000000400888 -> 000000000040088c 0 cycles P 0
..... 63: 000000000040088c -> 0000000000400898 0 cycles P 0
... thread: sort_3k:15163
...... dso: /root/kim/sort_3k
> (as well as
> the full objdump of the bubble_sort function). This will help evaluate if
> profile is accurate at binary level.
# objdump -d sort_3k > sort_3k.objdump
See attached sort_3k.objdump
The 0x4008xx addresses in the above branch stack are in sort_array,
since bubble_sort gets inlined into it:
0000000000400838 <sort_array>:
400838: d1400bff sub sp, sp, #0x2, lsl #12
40083c: 90000001 adrp x1, 400000 <_init-0x538>
400840: d13bc3ff sub sp, sp, #0xef0
400844: d285e302 mov x2, #0x2f18 // #12056
400848: 91266021 add x1, x1, #0x998
40084c: a9bd7bfd stp x29, x30, [sp,#-48]!
400850: 910003fd mov x29, sp
400854: f90013f5 str x21, [sp,#32]
400858: b0000095 adrp x21, 411000 <_GLOBAL_OFFSET_TABLE_+0x28>
40085c: 910122a3 add x3, x21, #0x48
400860: 9100e3a0 add x0, x29, #0x38
400864: f9400064 ldr x4, [x3]
400868: f9178fa4 str x4, [x29,#12056]
40086c: d2800004 mov x4, #0x0 // #0
400870: a90153f3 stp x19, x20, [sp,#16]
400874: 8b0203b4 add x20, x29, x2
400878: aa0003f3 mov x19, x0
40087c: 52817702 mov w2, #0xbb8 // #3000
400880: 52800020 mov w0, #0x1 // #1
400884: 97ffff3f bl 400580 <__printf_chk@plt>
400888: 97ffff46 bl 4005a0 <rand@plt>
40088c: b8004660 str w0, [x19],#4
400890: eb14027f cmp x19, x20
400894: 54ffffa1 b.ne 400888 <sort_array+0x50>
400898: d285e280 mov x0, #0x2f14 // #12052
40089c: 8b0003a3 add x3, x29, x0
4008a0: 9100e3a0 add x0, x29, #0x38
4008a4: 52800004 mov w4, #0x0 // #0
4008a8: 29400402 ldp w2, w1, [x0]
4008ac: 6b02003f cmp w1, w2
4008b0: 5400006a b.ge 4008bc <sort_array+0x84>
4008b4: 52800024 mov w4, #0x1 // #1
4008b8: 29000801 stp w1, w2, [x0]
4008bc: 91001000 add x0, x0, #0x4
4008c0: eb00007f cmp x3, x0
4008c4: 54ffff21 b.ne 4008a8 <sort_array+0x70>
4008c8: 35fffec4 cbnz w4, 4008a0 <sort_array+0x68>
4008cc: 910122b5 add x21, x21, #0x48
4008d0: f9578fa1 ldr x1, [x29,#12056]
4008d4: f94002a0 ldr x0, [x21]
4008d8: ca000020 eor x0, x1, x0
4008dc: b50000e0 cbnz x0, 4008f8 <sort_array+0xc0>
4008e0: a94153f3 ldp x19, x20, [sp,#16]
4008e4: f94013f5 ldr x21, [sp,#32]
4008e8: a8c37bfd ldp x29, x30, [sp],#48
4008ec: 91400bff add sp, sp, #0x2, lsl #12
4008f0: 913bc3ff add sp, sp, #0xef0
4008f4: d65f03c0 ret
4008f8: 97ffff2e bl 4005b0 <__stack_chk_fail@plt>
4008fc: 00000000 .inst 0x00000000 ; undefined