On Wed, May 17, 2017 at 7:31 AM, Sebastian Pop <sebpop@gmail.com> wrote:
Hi Dehao,

On Tue, May 16, 2017 at 7:17 PM, Dehao Chen <dehao@google.com> wrote:
> Could you copy-paste the code with line numbers (the one you pasted does not
> match the profile)?

Please see attached sort.c

> Could you also provide the assembly of the -g1 arm
> binary so that I can examine the debug info?

# gcc -O3 -g1 sort.c -S -o sort-aarch64.s

See attached sort-aarch64.s

>
> Also, it would be helpful if you can use sample_merger to generate the text
> version of the perf.data file and attach that to the thread too

Could you please provide the full command line I should be using?
I tried the following with no meaningful output:

# sample_merger -profile inj
# cat data.txt
0
0
0

You also need to pass -binary, pointing to the binary that was profiled.
 

When I dump the raw trace with:
# perf report -D -i inj > dump.txt
one of the LBR events looks like this:

0x117340 [0x630]: PERF_RECORD_SAMPLE(IP, 0x2): 15163/15163: 0x40088c
period: 3 addr: 0
... branch stack: nr:64
.....  0: 0000ffffa3693ea4 -> 0000ffffa3693eac 0 cycles  P   0
.....  1: 0000ffffa3693a10 -> 0000ffffa3693a20 0 cycles  P   0
.....  2: 0000ffffa3693a08 -> 0000ffffa3693a10 0 cycles  P   0
.....  3: 0000ffffa36939f4 -> 0000ffffa3693a08 0 cycles  P   0
.....  4: 0000ffffa3693b88 -> 0000ffffa3693ba0 0 cycles  P   0
.....  5: 0000ffffa3693b58 -> 0000ffffa3693b88 0 cycles  P   0
.....  6: 0000ffffa3693b4c -> 0000ffffa3693b58 0 cycles  P   0
.....  7: 0000ffffa3693b40 -> 0000ffffa3693b4c 0 cycles  P   0
.....  8: 0000ffffa36939e4 -> 0000ffffa36939f4 0 cycles  P   0
.....  9: 0000ffffa36939e0 -> 0000ffffa36939e4 0 cycles  P   0
..... 10: 0000ffffa36939d8 -> 0000ffffa36939e0 0 cycles  P   0
..... 11: 0000ffffa36939b0 -> 0000ffffa36939d8 0 cycles  P   0
..... 12: 0000ffffa3693e98 -> 0000ffffa3693ea4 0 cycles  P   0
..... 13: 00000000004005a0 -> 00000000004005b0 0 cycles  P   0
..... 14: 0000000000400888 -> 000000000040088c 0 cycles  P   0
..... 15: 000000000040088c -> 0000000000400898 0 cycles  P   0
..... 16: 0000ffffa3693ea4 -> 0000ffffa3693eac 0 cycles  P   0
..... 17: 0000ffffa3693a10 -> 0000ffffa3693a20 0 cycles  P   0
..... 18: 0000ffffa3693a08 -> 0000ffffa3693a10 0 cycles  P   0
..... 19: 0000ffffa36939f4 -> 0000ffffa3693a08 0 cycles  P   0
..... 20: 0000ffffa3693b88 -> 0000ffffa3693ba0 0 cycles  P   0
..... 21: 0000ffffa3693b58 -> 0000ffffa3693b88 0 cycles  P   0
..... 22: 0000ffffa3693b4c -> 0000ffffa3693b58 0 cycles  P   0
..... 23: 0000ffffa3693b40 -> 0000ffffa3693b4c 0 cycles  P   0
..... 24: 0000ffffa36939e4 -> 0000ffffa36939f4 0 cycles  P   0
..... 25: 0000ffffa36939e0 -> 0000ffffa36939e4 0 cycles  P   0
..... 26: 0000ffffa36939d8 -> 0000ffffa36939e0 0 cycles  P   0
..... 27: 0000ffffa36939b0 -> 0000ffffa36939d8 0 cycles  P   0
..... 28: 0000ffffa3693e98 -> 0000ffffa3693ea4 0 cycles  P   0
..... 29: 00000000004005a0 -> 00000000004005b0 0 cycles  P   0
..... 30: 0000000000400888 -> 000000000040088c 0 cycles  P   0
..... 31: 000000000040088c -> 0000000000400898 0 cycles  P   0
..... 32: 0000ffffa3693ea4 -> 0000ffffa3693eac 0 cycles  P   0
..... 33: 0000ffffa3693a10 -> 0000ffffa3693a20 0 cycles  P   0
..... 34: 0000ffffa3693a08 -> 0000ffffa3693a10 0 cycles  P   0
..... 35: 0000ffffa36939f4 -> 0000ffffa3693a08 0 cycles  P   0
..... 36: 0000ffffa3693b88 -> 0000ffffa3693ba0 0 cycles  P   0
..... 37: 0000ffffa3693b58 -> 0000ffffa3693b88 0 cycles  P   0
..... 38: 0000ffffa3693b4c -> 0000ffffa3693b58 0 cycles  P   0
..... 39: 0000ffffa3693b40 -> 0000ffffa3693b4c 0 cycles  P   0
..... 40: 0000ffffa36939e4 -> 0000ffffa36939f4 0 cycles  P   0
..... 41: 0000ffffa36939e0 -> 0000ffffa36939e4 0 cycles  P   0
..... 42: 0000ffffa36939d8 -> 0000ffffa36939e0 0 cycles  P   0
..... 43: 0000ffffa36939b0 -> 0000ffffa36939d8 0 cycles  P   0
..... 44: 0000ffffa3693e98 -> 0000ffffa3693ea4 0 cycles  P   0
..... 45: 00000000004005a0 -> 00000000004005b0 0 cycles  P   0
..... 46: 0000000000400888 -> 000000000040088c 0 cycles  P   0
..... 47: 000000000040088c -> 0000000000400898 0 cycles  P   0
..... 48: 0000ffffa3693ea4 -> 0000ffffa3693eac 0 cycles  P   0
..... 49: 0000ffffa3693a10 -> 0000ffffa3693a20 0 cycles  P   0
..... 50: 0000ffffa3693a08 -> 0000ffffa3693a10 0 cycles  P   0
..... 51: 0000ffffa36939f4 -> 0000ffffa3693a08 0 cycles  P   0
..... 52: 0000ffffa3693b88 -> 0000ffffa3693ba0 0 cycles  P   0
..... 53: 0000ffffa3693b58 -> 0000ffffa3693b88 0 cycles  P   0
..... 54: 0000ffffa3693b4c -> 0000ffffa3693b58 0 cycles  P   0
..... 55: 0000ffffa3693b40 -> 0000ffffa3693b4c 0 cycles  P   0
..... 56: 0000ffffa36939e4 -> 0000ffffa36939f4 0 cycles  P   0
..... 57: 0000ffffa36939e0 -> 0000ffffa36939e4 0 cycles  P   0
..... 58: 0000ffffa36939d8 -> 0000ffffa36939e0 0 cycles  P   0
..... 59: 0000ffffa36939b0 -> 0000ffffa36939d8 0 cycles  P   0
..... 60: 0000ffffa3693e98 -> 0000ffffa3693ea4 0 cycles  P   0
..... 61: 00000000004005a0 -> 00000000004005b0 0 cycles  P   0
..... 62: 0000000000400888 -> 000000000040088c 0 cycles  P   0
..... 63: 000000000040088c -> 0000000000400898 0 cycles  P   0
 ... thread: sort_3k:15163
 ...... dso: /root/kim/sort_3k


> (as well as
> the full objdump of the bubble_sort function). This will help evaluate whether
> the profile is accurate at the binary level.

# objdump -d sort_3k > sort_3k.objdump

See attached sort_3k.objdump

The addresses in the above branch stack are in sort_array,
as bubble_sort gets inlined:

0000000000400838 <sort_array>:
  400838:       d1400bff        sub     sp, sp, #0x2, lsl #12
  40083c:       90000001        adrp    x1, 400000 <_init-0x538>
  400840:       d13bc3ff        sub     sp, sp, #0xef0
  400844:       d285e302        mov     x2, #0x2f18                     // #12056
  400848:       91266021        add     x1, x1, #0x998
  40084c:       a9bd7bfd        stp     x29, x30, [sp,#-48]!
  400850:       910003fd        mov     x29, sp
  400854:       f90013f5        str     x21, [sp,#32]
  400858:       b0000095        adrp    x21, 411000 <_GLOBAL_OFFSET_TABLE_+0x28>
  40085c:       910122a3        add     x3, x21, #0x48
  400860:       9100e3a0        add     x0, x29, #0x38
  400864:       f9400064        ldr     x4, [x3]
  400868:       f9178fa4        str     x4, [x29,#12056]
  40086c:       d2800004        mov     x4, #0x0                        // #0
  400870:       a90153f3        stp     x19, x20, [sp,#16]
  400874:       8b0203b4        add     x20, x29, x2
  400878:       aa0003f3        mov     x19, x0
  40087c:       52817702        mov     w2, #0xbb8                      // #3000
  400880:       52800020        mov     w0, #0x1                        // #1
  400884:       97ffff3f        bl      400580 <__printf_chk@plt>
  400888:       97ffff46        bl      4005a0 <rand@plt>
  40088c:       b8004660        str     w0, [x19],#4
  400890:       eb14027f        cmp     x19, x20
  400894:       54ffffa1        b.ne    400888 <sort_array+0x50>
  400898:       d285e280        mov     x0, #0x2f14                     // #12052
  40089c:       8b0003a3        add     x3, x29, x0
  4008a0:       9100e3a0        add     x0, x29, #0x38
  4008a4:       52800004        mov     w4, #0x0                        // #0
  4008a8:       29400402        ldp     w2, w1, [x0]
  4008ac:       6b02003f        cmp     w1, w2
  4008b0:       5400006a        b.ge    4008bc <sort_array+0x84>
  4008b4:       52800024        mov     w4, #0x1                        // #1
  4008b8:       29000801        stp     w1, w2, [x0]
  4008bc:       91001000        add     x0, x0, #0x4
  4008c0:       eb00007f        cmp     x3, x0
  4008c4:       54ffff21        b.ne    4008a8 <sort_array+0x70>
  4008c8:       35fffec4        cbnz    w4, 4008a0 <sort_array+0x68>
  4008cc:       910122b5        add     x21, x21, #0x48
  4008d0:       f9578fa1        ldr     x1, [x29,#12056]
  4008d4:       f94002a0        ldr     x0, [x21]
  4008d8:       ca000020        eor     x0, x1, x0
  4008dc:       b50000e0        cbnz    x0, 4008f8 <sort_array+0xc0>
  4008e0:       a94153f3        ldp     x19, x20, [sp,#16]
  4008e4:       f94013f5        ldr     x21, [sp,#32]
  4008e8:       a8c37bfd        ldp     x29, x30, [sp],#48
  4008ec:       91400bff        add     sp, sp, #0x2, lsl #12
  4008f0:       913bc3ff        add     sp, sp, #0xef0
  4008f4:       d65f03c0        ret
  4008f8:       97ffff2e        bl      4005b0 <__stack_chk_fail@plt>
  4008fc:       00000000        .inst   0x00000000 ; undefined