Hi all,
I had a go at replacing the intrinsics code with pure asm. There is an obvious speedup in the 8x unrolled case, but the results for the other cases are inconclusive.
Anyone care to have a go at running this on something other than the Cortex-A57 I used? (It's as simple as enabling CONFIG_MD_RAID456 as a builtin, then building and booting the kernel.)
BTW this applies on top of
http://article.gmane.org/gmane.linux.ports.arm.kernel/422124
but only the following hunk should be required, and AFAIK it is not a hard prerequisite: without it, only the 'xor()' lines should be missing from the benchmark output.
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c
index d9ad6ee284f4..7076ef1ba3dd 100644
--- a/lib/raid6/neon.c
+++ b/lib/raid6/neon.c
@@ -40,9 +40,20 @@
 					   (unsigned long)bytes, ptrs);	\
 		kernel_neon_end();					\
 	}								\
+	static void raid6_neon ## _n ## _xor_syndrome(int disks,	\
+					int start, int stop,		\
+					size_t bytes, void **ptrs)	\
+	{								\
+		void raid6_neon ## _n ## _xor_syndrome_real(int,	\
+				int, int, unsigned long, void**);	\
+		kernel_neon_begin();					\
+		raid6_neon ## _n ## _xor_syndrome_real(disks,		\
+			start, stop, (unsigned long)bytes, ptrs);	\
+		kernel_neon_end();					\
+	}								\
 	struct raid6_calls const raid6_neonx ## _n = {			\
 		raid6_neon ## _n ## _gen_syndrome,			\
-		NULL, /* XOR not yet implemented */			\
+		raid6_neon ## _n ## _xor_syndrome,			\
 		raid6_have_neon,					\
 		"neonx" #_n,						\
 		0							\
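To make the token pasting explicit: for _n = 4, the wrapper added by this hunk expands to roughly the following (illustration only, not part of the patch; the _real routine is the assembler entry point provided by the series):

static void raid6_neon4_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	void raid6_neon4_xor_syndrome_real(int, int, int, unsigned long,
					   void **);

	/* NEON registers may only be touched between these two calls */
	kernel_neon_begin();
	raid6_neon4_xor_syndrome_real(disks, start, stop,
				      (unsigned long)bytes, ptrs);
	kernel_neon_end();
}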
Cheers, Ard.
-----------8<------------------
The NEON intrinsics implementation of RAID-6 syndrome generation is not as close to optimal as we would like. Since the actual algorithm is not very heavyweight, GCC's poor handling of vector loads and stores noticeably hurts overall performance.
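(For reference, the per-byte transform both implementations compute is essentially the following scalar loop, in the spirit of the generic C version; the function name is made up for illustration and this is not part of the patch:)

#include <stddef.h>
#include <stdint.h>

/*
 * P is the plain XOR of all data disks; Q accumulates the data disks
 * weighted by powers of x in GF(2^8) with the 0x11d polynomial, by
 * multiplying the running value by x before folding in the next disk.
 */
static void gen_syndrome_sketch(int disks, size_t bytes, uint8_t **dptr)
{
	uint8_t *p = dptr[disks - 2];	/* P: XOR parity */
	uint8_t *q = dptr[disks - 1];	/* Q: GF(2^8) syndrome */

	for (size_t i = 0; i < bytes; i++) {
		uint8_t wp = dptr[disks - 3][i];	/* highest data disk */
		uint8_t wq = wp;

		for (int z = disks - 4; z >= 0; z--) {
			/* Q: multiply the accumulator by x, polynomial 0x11d */
			wq = (uint8_t)(wq << 1) ^ ((wq & 0x80) ? 0x1d : 0);
			/* P: plain XOR; Q: fold in the same data byte */
			wp ^= dptr[z][i];
			wq ^= dptr[z][i];
		}
		p[i] = wp;
		q[i] = wq;
	}
}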
So replace it with a pure assembler implementation:
Before:
  raid6: neonx1   gen()  1834 MB/s
  raid6: neonx1   xor()  1278 MB/s
  raid6: neonx2   gen()  2528 MB/s
  raid6: neonx2   xor()  1942 MB/s
  raid6: neonx4   gen()  2888 MB/s
  raid6: neonx4   xor()  2334 MB/s
  raid6: neonx8   gen()  2957 MB/s
  raid6: neonx8   xor()  2232 MB/s

After:
  raid6: neonx1   gen()  2458 MB/s
  raid6: neonx1   xor()  1839 MB/s
  raid6: neonx2   gen()  2874 MB/s
  raid6: neonx2   xor()  2280 MB/s
  raid6: neonx4   gen()  3431 MB/s
  raid6: neonx4   xor()  2338 MB/s
NOTE: the x4 assembler implementation maps onto the x8 intrinsics version, since the assembler code already does a 2-way interleave before unrolling. This also means an x8 version cannot be generated: the 2-way interleaved, 4-way unrolled code already claims all 32 NEON registers for its P, Q, data and temporary vectors, leaving none for a wider unroll.
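The xor() figures above exercise the xor_syndrome() path, which updates an existing P/Q pair for a partial write covering data disks start..stop; disks below start only contribute further multiplications by x to Q (the "left side" optimization mentioned in the assembler comments). In scalar form it behaves roughly as follows (a sketch with a made-up function name, using the same headers as the sketch above; not part of the patch):

static void xor_syndrome_sketch(int disks, int start, int stop,
				size_t bytes, uint8_t **dptr)
{
	uint8_t *p = dptr[disks - 2];	/* existing P parity */
	uint8_t *q = dptr[disks - 1];	/* existing Q syndrome */

	for (size_t i = 0; i < bytes; i++) {
		uint8_t wp = dptr[stop][i];	/* highest rewritten data disk */
		uint8_t wq = wp;

		/* full P/Q update for the rewritten disks stop-1..start */
		for (int z = stop - 1; z >= start; z--) {
			wq = (uint8_t)(wq << 1) ^ ((wq & 0x80) ? 0x1d : 0);
			wp ^= dptr[z][i];
			wq ^= dptr[z][i];
		}
		/* left side: only keep multiplying Q by x for disks start-1..0 */
		for (int z = start - 1; z >= 0; z--)
			wq = (uint8_t)(wq << 1) ^ ((wq & 0x80) ? 0x1d : 0);

		p[i] ^= wp;	/* fold the delta into the stored parity */
		q[i] ^= wq;
	}
}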
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 lib/raid6/Makefile      |  25 +++++--
 lib/raid6/algos.c       |   2 +
 lib/raid6/neon-arm64.uS | 175 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/raid6/neon.c        |   2 +
 4 files changed, 197 insertions(+), 7 deletions(-)
 create mode 100644 lib/raid6/neon-arm64.uS
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index c7dab0645554..b26c48bafbb7 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -5,7 +5,9 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
-raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
+raid6_neon-$(CONFIG_ARM) += neon1.o neon2.o neon4.o neon8.o
+raid6_neon-$(CONFIG_ARM64) += neon1-arm64.o neon2-arm64.o neon4-arm64.o
+raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o $(raid6_neon-y)
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
 
 hostprogs-y += mktables
@@ -25,12 +27,6 @@ NEON_FLAGS := -ffreestanding
 ifeq ($(ARCH),arm)
 NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
 endif
-ifeq ($(ARCH),arm64)
-CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only
-endif
 endif
 
 targets += int1.c
@@ -111,6 +107,21 @@ $(obj)/neon8.c: UNROLL := 8
 $(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
+targets += neon1-arm64.S
+$(obj)/neon1-arm64.S: UNROLL := 1
+$(obj)/neon1-arm64.S: $(src)/neon-arm64.uS $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+targets += neon2-arm64.S
+$(obj)/neon2-arm64.S: UNROLL := 2
+$(obj)/neon2-arm64.S: $(src)/neon-arm64.uS $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+targets += neon4-arm64.S
+$(obj)/neon4-arm64.S: UNROLL := 4
+$(obj)/neon4-arm64.S: $(src)/neon-arm64.uS $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
 targets += tilegx8.c
 $(obj)/tilegx8.c: UNROLL := 8
 $(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 975c6e0434bd..a5614bfc3e85 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -77,8 +77,10 @@ const struct raid6_calls * const raid6_algos[] = {
 #ifdef CONFIG_KERNEL_MODE_NEON
 	&raid6_neonx1,
 	&raid6_neonx2,
 	&raid6_neonx4,
+#ifdef CONFIG_ARM
 	&raid6_neonx8,
 #endif
+#endif
 	NULL
 };
diff --git a/lib/raid6/neon-arm64.uS b/lib/raid6/neon-arm64.uS
new file mode 100644
index 000000000000..8873dc439281
--- /dev/null
+++ b/lib/raid6/neon-arm64.uS
@@ -0,0 +1,175 @@
+/*
+ * arm64.S - RAID-6 syndrome calculation using AArch64 NEON instructions
+ *
+ * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+	vpa0	.req	v0
+	vpb0	.req	v1
+	vqa0	.req	v2
+	vqb0	.req	v3
+	vpa1	.req	v4
+	vpb1	.req	v5
+	vqa1	.req	v6
+	vqb1	.req	v7
+	vpa2	.req	v8
+	vpb2	.req	v9
+	vqa2	.req	v10
+	vqb2	.req	v11
+	vpa3	.req	v12
+	vpb3	.req	v13
+	vqa3	.req	v14
+	vqb3	.req	v15
+	vda0	.req	v16
+	vdb0	.req	v17
+	vda1	.req	v18
+	vdb1	.req	v19
+	vda2	.req	v20
+	vdb2	.req	v21
+	vda3	.req	v22
+	vdb3	.req	v23
+	vta0	.req	v24
+	vtb0	.req	v25
+	vta1	.req	v26
+	vtb1	.req	v27
+	vta2	.req	v28
+	vtb2	.req	v29
+	vta3	.req	v30
+	vtb3	.req	v31
+
+	.macro	raid6_neon_core, inp, off
+	//
+	// Two versions of the P/Q core transform rolled into one:
+	// - the full one which takes the data input and computes both the
+	//   XOR (P) and GF(2^8) (Q) syndromes
+	// - if no input is passed, only the Q syndrome is recomputed, as is
+	//   needed for the P/Q left side optimization.
+	//
+
+	// Q: mul by x in GF(2^8) with polynomial 0x11d
+	sshr	vta$$.16b, vqa$$.16b, #7		// mask
+	.ifnb	\inp
+	add	\inp, \inp, \off			// add outer byte offset
+	.endif
+	sshr	vtb$$.16b, vqb$$.16b, #7
+	.ifnb	\inp
+	ld1	{vda$$.2d, vdb$$.2d}, [\inp], #32
+	.endif
+	shl	vqa$$.16b, vqa$$.16b, #1		// shift
+	shl	vqb$$.16b, vqb$$.16b, #1
+
+	// P: xor
+	.ifnb	\inp
+	eor	vpa$$.16b, vpa$$.16b, vda$$.16b
+	eor	vpb$$.16b, vpb$$.16b, vdb$$.16b
+	.endif
+
+	// Q: continued
+	.ifnb	\inp
+	eor	vqa$$.16b, vqa$$.16b, vda$$.16b
+	movi	vda0.16b, #0x1d
+	eor	vqb$$.16b, vqb$$.16b, vdb$$.16b
+	.endif
+	and	vta$$.16b, vta$$.16b, vda0.16b
+	and	vtb$$.16b, vtb$$.16b, vda0.16b
+	eor	vqa$$.16b, vqa$$.16b, vta$$.16b
+	eor	vqb$$.16b, vqb$$.16b, vtb$$.16b
+	.endm
+
+	/*
+	 * void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes,
+	 *				       void **ptrs)
+	 */
+ENTRY(raid6_neon$#_gen_syndrome_real)
+	add	x5, x2, x0, lsl #3
+	ldp	x6, x7, [x5, #-16]		// x6: P[], x7: Q[]
+	ldr	x8, [x5, #-24]!			// x8: datadisk(highest)[]
+	mov	x9, #0				// x9: outer byte offset
+
+.Lgen_outer:
+	mov	x10, x5
+
+	ld1	{vqa$$.2d, vqb$$.2d}, [x8], #32
+	mov	vpa$$.16b, vqa$$.16b
+	mov	vpb$$.16b, vqb$$.16b
+
+.Lgen_inner:
+	ldr	x11, [x10, #-8]!		// next datadisk
+
+	raid6_neon_core	x11, x9
+
+	cmp	x10, x2
+	bhi	.Lgen_inner
+
+	add	x9, x9, #($# * 32)		// next stripe
+	cmp	x9, x1
+
+	st1	{vpa$$.2d, vpb$$.2d}, [x6], #32
+	st1	{vqa$$.2d, vqb$$.2d}, [x7], #32
+
+	bne	.Lgen_outer
+	ret
+ENDPROC(raid6_neon$#_gen_syndrome_real)
+
+	/*
+	 * void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
+	 *				       unsigned long bytes, void **ptrs)
+	 */
+ENTRY(raid6_neon$#_xor_syndrome_real)
+	add	x5, x4, x0, lsl #3
+	ldp	x6, x7, [x5, #-16]		// x6: P[], x7: Q[]
+	add	x12, x4, x1, lsl #3		// x12: &datadisk(start)[]
+	ldr	x8, [x4, x2, lsl #3]		// x8: datadisk(stop)[]
+	mov	x9, #0				// x9: outer byte offset
+
+.Lxor_outer:
+	add	x10, x4, x2, lsl #3
+
+	ld1	{vpa$$.2d, vpb$$.2d}, [x6], #32
+	ld1	{vqa$$.2d, vqb$$.2d}, [x8], #32
+
+	eor	vpa$$.16b, vpa$$.16b, vqa$$.16b
+	eor	vpb$$.16b, vpb$$.16b, vqb$$.16b
+
+.Lxor_inner1:
+	cmp	x10, x12
+	bls	.Lxor_next_inner
+
+	ldr	x11, [x10, #-8]!		// next datadisk
+	raid6_neon_core	x11, x9
+	b	.Lxor_inner1
+
+.Lxor_next_inner:
+	subs	x10, x10, x4
+	bls	.Lxor_next
+	movi	vda0.16b, #0x1d
+
+.Lxor_inner2:
+	raid6_neon_core
+
+	subs	x10, x10, #8
+	bhi	.Lxor_inner2
+
+.Lxor_next:
+	add	x9, x9, #($# * 32)		// next stripe
+	cmp	x9, x3
+
+	ld1	{vta$$.2d, vtb$$.2d}, [x7], #32
+	sub	x6, x6, #($# * 32)
+	sub	x7, x7, #($# * 32)
+
+	eor	vqa$$.16b, vqa$$.16b, vta$$.16b
+	eor	vqb$$.16b, vqb$$.16b, vtb$$.16b
+
+	st1	{vpa$$.2d, vpb$$.2d}, [x6], #32
+	st1	{vqa$$.2d, vqb$$.2d}, [x7], #32
+
+	bne	.Lxor_outer
+	ret
+ENDPROC(raid6_neon$#_xor_syndrome_real)
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c
index 7076ef1ba3dd..683266f68118 100644
--- a/lib/raid6/neon.c
+++ b/lib/raid6/neon.c
@@ -67,4 +67,6 @@ static int raid6_have_neon(void)
 RAID6_NEON_WRAPPER(1);
 RAID6_NEON_WRAPPER(2);
 RAID6_NEON_WRAPPER(4);
+#ifdef CONFIG_ARM
 RAID6_NEON_WRAPPER(8);
+#endif