Vectorised copy

Michael Hope michael.hope at linaro.org
Mon Sep 5 01:32:43 UTC 2011


On Sat, Sep 3, 2011 at 4:54 AM, Ulrich Weigand
<Ulrich.Weigand at de.ibm.com> wrote:
> Michael Hope <michael.hope at linaro.org> wrote:
>
>> int *a;
>> int *b;
>> int *c;
>>
>> const int ad[320];
>> const int bd[320];
>> const int cd[320];
>>
>> void fill()
>> {
>>   for (int i = 0; i < 320; i++)
>>     {
>>       a[i] = ad[i];
>>       b[i] = bd[i];
>>       c[i] = cd[i];
>>     }
>> }
> [snip]
>> Can we always use the second form?  What optimisation is preventing it?
>
> Without having looked into this in detail, my guess would be
> it depends on whether the compiler is able to prove that the
> memory pointed to by a, b, and c is distinct (instead of having
> a potential overlap if those are pointers into the same array).
>
> Does it help if you make a, b, and c function arguments to fill,
> and mark them restrict?

Yep, I had a go with that originally.  Here are the variants:

(1) - local source, local destination:

/* Destination arrays: defined in this file with a fixed size of 320 ints. */
int a[320];
int b[320];
int c[320];

/* Source arrays: const, also defined in this file, with no initializer. */
const int ad[320];
const int bd[320];
const int cd[320];

/*
 * Copy all 320 elements of ad, bd and cd into a, b and c respectively.
 * The three copies are interleaved element by element in a single loop
 * rather than being written as three separate loops.
 */
void fill()
{
  for (int i = 0; i < 320; i++)
    {
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}

gives the best code:

fill:
	push	{r4, r5, r6}
	ldr	r6, .L5
	ldr	r5, .L5+4
	ldr	r4, .L5+8
	sub	r3, r6, #1280
	ldr	r0, .L5+12
	ldr	r1, .L5+16
	ldr	r2, .L5+20
.L2:
	vldmia	r0!, {d16-d17}
	vldmia	r5!, {d18-d19}
	vstmia	r4!, {d18-d19}
	vstmia	r1!, {d16-d17}
	vldmia	r2!, {d16-d17}
	vstmia	r3!, {d16-d17}
	cmp	r3, r6
	bne	.L2
	pop	{r4, r5, r6}
	bx	lr

(2) - extern destination, local source with -fno-section-anchors to
make the code more readable:

/*
 * Destination arrays: only declared here (extern); their definitions are
 * not part of this snippet.
 */
extern int a[320];
extern int b[320];
extern int c[320];

/* Source arrays: const, defined in this file, with no initializer. */
const int ad[320];
const int bd[320];
const int cd[320];

/*
 * Copy all 320 elements of ad, bd and cd into the extern arrays a, b and c
 * respectively, interleaving the three copies in a single loop.
 */
void fill()
{
  for (int i = 0; i < 320; i++)
    {
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}

fill:
	ldr	r2, .L5
	push	{r4, r5, r6, r7, r8}
	ldr	r0, .L5+4
	mov	r3, r2
	add	r8, r2, #1280
	ldr	r7, .L5+8
	ldr	r6, .L5+12
	rsb	ip, r3, r0
	ldr	r1, .L5+16
	ldr	r2, .L5+20
	subs	r7, r7, r3
	subs	r6, r6, r3
.L2:
	add	r5, ip, r3
	adds	r4, r7, r3
	vldmia	r2!, {d16-d17}
	vldmia	r1!, {d18-d19}
	adds	r0, r6, r3
	vst1.32	{q9}, [r5]
	vst1.32	{q8}, [r4]
	vldmia	r3, {d16-d17}
	adds	r3, r3, #16
	cmp	r3, r8
	vst1.32	{q8}, [r0]
	bne	.L2
	pop	{r4, r5, r6, r7, r8}
	bx	lr

(3) destination as arguments, restrict:

/*
 * Copy the first 320 elements of the file-scope arrays ad, bd and cd
 * (defined outside this snippet) into the buffers passed as a, b and c.
 *
 * a, b, c: destination pointers, each written at indices 0..319, so each
 *          must refer to at least 320 writable ints.  All three are
 *          __restrict-qualified, which tells the compiler that the objects
 *          accessed through them do not overlap one another.
 */
void fill3(int * __restrict a, int * __restrict b, int * __restrict c)
{
  for (int i = 0; i < 320; i++)
    {
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}

fill3:
	push	{r4, r5, r6, r7, r8}
	ldr	r6, .L23
	ldr	r5, .L23+4
	ldr	r4, .L23+8
	mov	r3, r6
	subs	r0, r0, r3
	add	r6, r6, #1280
	subs	r1, r1, r3
	subs	r2, r2, r3
.L21:
	add	r8, r3, r0
	add	ip, r3, r1
	vldmia	r4!, {d16-d17}
	vldmia	r5!, {d18-d19}
	adds	r7, r3, r2
	vst1.32	{q9}, [r8]
	vst1.32	{q8}, [ip]
	vldmia	r3, {d16-d17}
	adds	r3, r3, #16
	cmp	r3, r6
	vst1.32	{q8}, [r7]
	bne	.L21
	pop	{r4, r5, r6, r7, r8}
	bx	lr

(4) destination as aligned structs:

/*
 * A 320-int array wrapped in a struct so that an alignment requirement can
 * be attached to the type: the aligned(128) attribute requests 128-byte
 * alignment for every object of type struct blob.
 */
struct blob
{
  int v[320];
} __attribute__((aligned(128)));

/*
 * Copy all 320 elements of the file-scope arrays ad, bd and cd (defined
 * outside this snippet) into the v members of the three structs passed in.
 *
 * a, b, c: destination structs; each one's v[0..319] is overwritten.  All
 *          three pointers are __restrict-qualified, which tells the
 *          compiler that the objects accessed through them do not overlap
 *          one another.
 */
void fill(struct blob * __restrict a, struct blob * __restrict b,
struct blob * __restrict c)
{
  for (int i = 0; i < 320; i++)
    {
      a->v[i] = ad[i];
      b->v[i] = bd[i];
      c->v[i] = cd[i];
    }
}

fill:
	push	{r4, r5, r6}
	add	r6, r2, #1280
	ldr	r3, .L5
	ldr	r4, .L5+4
	ldr	r5, .L5+8
.L2:
	vldmia	r3!, {d16-d17}
	vstmia	r0!, {d16-d17}
	vldmia	r4!, {d16-d17}
	vstmia	r1!, {d16-d17}
	vldmia	r5!, {d16-d17}
	vstmia	r2!, {d16-d17}
	cmp	r2, r6
	bne	.L2
	pop	{r4, r5, r6}
	bx	lr

Version (3) seems to rejigger the destination pointers.  I assume this
is a side effect of not knowing whether the destination is aligned?

-- Michael



More information about the linaro-toolchain mailing list