Vectorised copy
Michael Hope
michael.hope at linaro.org
Mon Sep 5 01:32:43 UTC 2011
On Sat, Sep 3, 2011 at 4:54 AM, Ulrich Weigand
<Ulrich.Weigand at de.ibm.com> wrote:
> Michael Hope <michael.hope at linaro.org> wrote:
>
>> int *a;
>> int *b;
>> int *c;
>>
>> const int ad[320];
>> const int bd[320];
>> const int cd[320];
>>
>> void fill()
>> {
>> for (int i = 0; i < 320; i++)
>> {
>> a[i] = ad[i];
>> b[i] = bd[i];
>> c[i] = cd[i];
>> }
>> }
> [snip]
>> Can we always use the second form? What optimisation is preventing it?
>
> Without having looked into this in detail, my guess would be
> it depends on whether the compiler is able to prove that the
> memory pointed to by a, b, and c is distinct (instead of having
> a potential overlap if those are pointers into the same array).
>
> Does it help if you make a, b, and c function arguments to fill,
> and mark them restrict?
Yep, I had a go with that originally. Here are the variants:
(1) - local source, local destination:
int a[320];
int b[320];
int c[320];
const int ad[320];
const int bd[320];
const int cd[320];
void fill()
{
for (int i = 0; i < 320; i++)
{
a[i] = ad[i];
b[i] = bd[i];
c[i] = cd[i];
}
}
gives the best code:
fill:
push {r4, r5, r6}
ldr r6, .L5
ldr r5, .L5+4
ldr r4, .L5+8
sub r3, r6, #1280
ldr r0, .L5+12
ldr r1, .L5+16
ldr r2, .L5+20
.L2:
vldmia r0!, {d16-d17}
vldmia r5!, {d18-d19}
vstmia r4!, {d18-d19}
vstmia r1!, {d16-d17}
vldmia r2!, {d16-d17}
vstmia r3!, {d16-d17}
cmp r3, r6
bne .L2
pop {r4, r5, r6}
bx lr
(2) - extern destination, local source, compiled with
-fno-section-anchors to make the generated code more readable:
extern int a[320];
extern int b[320];
extern int c[320];
const int ad[320];
const int bd[320];
const int cd[320];
void fill()
{
for (int i = 0; i < 320; i++)
{
a[i] = ad[i];
b[i] = bd[i];
c[i] = cd[i];
}
}
fill:
ldr r2, .L5
push {r4, r5, r6, r7, r8}
ldr r0, .L5+4
mov r3, r2
add r8, r2, #1280
ldr r7, .L5+8
ldr r6, .L5+12
rsb ip, r3, r0
ldr r1, .L5+16
ldr r2, .L5+20
subs r7, r7, r3
subs r6, r6, r3
.L2:
add r5, ip, r3
adds r4, r7, r3
vldmia r2!, {d16-d17}
vldmia r1!, {d18-d19}
adds r0, r6, r3
vst1.32 {q9}, [r5]
vst1.32 {q8}, [r4]
vldmia r3, {d16-d17}
adds r3, r3, #16
cmp r3, r8
vst1.32 {q8}, [r0]
bne .L2
pop {r4, r5, r6, r7, r8}
bx lr
(3) - destination passed as restrict-qualified arguments:
void fill3(int * __restrict a, int * __restrict b, int * __restrict c)
{
for (int i = 0; i < 320; i++)
{
a[i] = ad[i];
b[i] = bd[i];
c[i] = cd[i];
}
}
fill3:
push {r4, r5, r6, r7, r8}
ldr r6, .L23
ldr r5, .L23+4
ldr r4, .L23+8
mov r3, r6
subs r0, r0, r3
add r6, r6, #1280
subs r1, r1, r3
subs r2, r2, r3
.L21:
add r8, r3, r0
add ip, r3, r1
vldmia r4!, {d16-d17}
vldmia r5!, {d18-d19}
adds r7, r3, r2
vst1.32 {q9}, [r8]
vst1.32 {q8}, [ip]
vldmia r3, {d16-d17}
adds r3, r3, #16
cmp r3, r6
vst1.32 {q8}, [r7]
bne .L21
pop {r4, r5, r6, r7, r8}
bx lr
(4) - destination as aligned structs:
struct blob
{
int v[320];
} __attribute__((aligned(128)));
void fill(struct blob * __restrict a, struct blob * __restrict b,
struct blob * __restrict c)
{
for (int i = 0; i < 320; i++)
{
a->v[i] = ad[i];
b->v[i] = bd[i];
c->v[i] = cd[i];
}
}
fill:
push {r4, r5, r6}
add r6, r2, #1280
ldr r3, .L5
ldr r4, .L5+4
ldr r5, .L5+8
.L2:
vldmia r3!, {d16-d17}
vstmia r0!, {d16-d17}
vldmia r4!, {d16-d17}
vstmia r1!, {d16-d17}
vldmia r5!, {d16-d17}
vstmia r2!, {d16-d17}
cmp r2, r6
bne .L2
pop {r4, r5, r6}
bx lr
Version (3) seems to rejigger the destination pointers. I assume this
is a side effect of not knowing whether the destination is aligned?
-- Michael
More information about the linaro-toolchain
mailing list