Hi all,
I had a go at replacing the intrinsics code with pure asm. There is an obvious speedup in the 8x unrolled case, but the results for the other cases are inconclusive.
Anyone care to have a go at running this on something other than the Cortex-A57 I used? (It's as simple as enabling CONFIG_MD_RAID456 as a builtin, then building and booting the kernel.)
BTW this applies on top of
http://article.gmane.org/gmane.linux.ports.arm.kernel/422124
but only the following hunk should be required, and AFAIK it is not a hard prerequisite: without it, only the 'xor()' lines should be missing from the benchmark output.
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c
index d9ad6ee284f4..7076ef1ba3dd 100644
--- a/lib/raid6/neon.c
+++ b/lib/raid6/neon.c
@@ -40,9 +40,20 @@
 					   (unsigned long)bytes, ptrs);	\
 		kernel_neon_end();					\
 	}								\
+	static void raid6_neon ## _n ## _xor_syndrome(int disks,	\
+					int start, int stop,		\
+					size_t bytes, void **ptrs)	\
+	{								\
+		void raid6_neon ## _n ## _xor_syndrome_real(int,	\
+				int, int, unsigned long, void**);	\
+		kernel_neon_begin();					\
+		raid6_neon ## _n ## _xor_syndrome_real(disks,		\
+			start, stop, (unsigned long)bytes, ptrs);	\
+		kernel_neon_end();					\
+	}								\
 	struct raid6_calls const raid6_neonx ## _n = {			\
 		raid6_neon ## _n ## _gen_syndrome,			\
-		NULL, /* XOR not yet implemented */			\
+		raid6_neon ## _n ## _xor_syndrome,			\
 		raid6_have_neon,					\
 		"neonx" #_n,						\
 		0							\
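To make the token pasting explicit: for _n = 4, the wrapper added by this hunk expands to roughly the following (illustration only, not part of the patch; the _real routine is the assembler entry point provided by the series):

static void raid6_neon4_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	void raid6_neon4_xor_syndrome_real(int, int, int, unsigned long,
					   void **);

	/* NEON registers may only be touched between these two calls */
	kernel_neon_begin();
	raid6_neon4_xor_syndrome_real(disks, start, stop,
				      (unsigned long)bytes, ptrs);
	kernel_neon_end();
}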
Cheers, Ard.
-----------8<------------------
The NEON intrinsics implementation of RAID-6 syndrome generation is not as close to optimal as we would like. Since the actual algorithm is not very heavyweight, GCC's poor handling of vector loads and stores noticeably hurts overall performance.
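(For reference, the per-byte transform both implementations compute is essentially the following scalar loop, in the spirit of the generic C version; the function name is made up for illustration and this is not part of the patch:)

#include <stddef.h>
#include <stdint.h>

/*
 * P is the plain XOR of all data disks; Q accumulates the data disks
 * weighted by powers of x in GF(2^8) with the 0x11d polynomial, by
 * multiplying the running value by x before folding in the next disk.
 */
static void gen_syndrome_sketch(int disks, size_t bytes, uint8_t **dptr)
{
	uint8_t *p = dptr[disks - 2];	/* P: XOR parity */
	uint8_t *q = dptr[disks - 1];	/* Q: GF(2^8) syndrome */

	for (size_t i = 0; i < bytes; i++) {
		uint8_t wp = dptr[disks - 3][i];	/* highest data disk */
		uint8_t wq = wp;

		for (int z = disks - 4; z >= 0; z--) {
			/* Q: multiply the accumulator by x, polynomial 0x11d */
			wq = (uint8_t)(wq << 1) ^ ((wq & 0x80) ? 0x1d : 0);
			/* P: plain XOR; Q: fold in the same data byte */
			wp ^= dptr[z][i];
			wq ^= dptr[z][i];
		}
		p[i] = wp;
		q[i] = wq;
	}
}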
So replace it with a pure assembler implementation:
Before:
  raid6: neonx1   gen()  1834 MB/s
  raid6: neonx1   xor()  1278 MB/s
  raid6: neonx2   gen()  2528 MB/s
  raid6: neonx2   xor()  1942 MB/s
  raid6: neonx4   gen()  2888 MB/s
  raid6: neonx4   xor()  2334 MB/s
  raid6: neonx8   gen()  2957 MB/s
  raid6: neonx8   xor()  2232 MB/s

After:
  raid6: neonx1   gen()  2458 MB/s
  raid6: neonx1   xor()  1839 MB/s
  raid6: neonx2   gen()  2874 MB/s
  raid6: neonx2   xor()  2280 MB/s
  raid6: neonx4   gen()  3431 MB/s
  raid6: neonx4   xor()  2338 MB/s
NOTE: the x4 assembler implementation maps onto the x8 intrinsics version, since the assembler code already does a 2-way interleave before unrolling. This also means an x8 version cannot be generated: the 2-way interleaved, 4-way unrolled code already claims all 32 NEON registers for its P, Q, data and temporary vectors, leaving none for a wider unroll.
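The xor() figures above exercise the xor_syndrome() path, which updates an existing P/Q pair for a partial write covering data disks start..stop; disks below start only contribute further multiplications by x to Q (the "left side" optimization mentioned in the assembler comments). In scalar form it behaves roughly as follows (a sketch with a made-up function name, using the same headers as the sketch above; not part of the patch):

static void xor_syndrome_sketch(int disks, int start, int stop,
				size_t bytes, uint8_t **dptr)
{
	uint8_t *p = dptr[disks - 2];	/* existing P parity */
	uint8_t *q = dptr[disks - 1];	/* existing Q syndrome */

	for (size_t i = 0; i < bytes; i++) {
		uint8_t wp = dptr[stop][i];	/* highest rewritten data disk */
		uint8_t wq = wp;

		/* full P/Q update for the rewritten disks stop-1..start */
		for (int z = stop - 1; z >= start; z--) {
			wq = (uint8_t)(wq << 1) ^ ((wq & 0x80) ? 0x1d : 0);
			wp ^= dptr[z][i];
			wq ^= dptr[z][i];
		}
		/* left side: only keep multiplying Q by x for disks start-1..0 */
		for (int z = start - 1; z >= 0; z--)
			wq = (uint8_t)(wq << 1) ^ ((wq & 0x80) ? 0x1d : 0);

		p[i] ^= wp;	/* fold the delta into the stored parity */
		q[i] ^= wq;
	}
}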
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 lib/raid6/Makefile      |  25 +++++--
 lib/raid6/algos.c       |   2 +
 lib/raid6/neon-arm64.uS | 175 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/raid6/neon.c        |   2 +
 4 files changed, 197 insertions(+), 7 deletions(-)
 create mode 100644 lib/raid6/neon-arm64.uS
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index c7dab0645554..b26c48bafbb7 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -5,7 +5,9 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
-raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
+raid6_neon-$(CONFIG_ARM) += neon1.o neon2.o neon4.o neon8.o
+raid6_neon-$(CONFIG_ARM64) += neon1-arm64.o neon2-arm64.o neon4-arm64.o
+raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o $(raid6_neon-y)
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
 
 hostprogs-y += mktables
@@ -25,12 +27,6 @@ NEON_FLAGS := -ffreestanding
 ifeq ($(ARCH),arm)
 NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
 endif
-ifeq ($(ARCH),arm64)
-CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only
-endif
 endif
 
 targets += int1.c
@@ -111,6 +107,21 @@ $(obj)/neon8.c: UNROLL := 8
 $(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
+targets += neon1-arm64.S
+$(obj)/neon1-arm64.S: UNROLL := 1
+$(obj)/neon1-arm64.S: $(src)/neon-arm64.uS $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+targets += neon2-arm64.S
+$(obj)/neon2-arm64.S: UNROLL := 2
+$(obj)/neon2-arm64.S: $(src)/neon-arm64.uS $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
+targets += neon4-arm64.S
+$(obj)/neon4-arm64.S: UNROLL := 4
+$(obj)/neon4-arm64.S: $(src)/neon-arm64.uS $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
 targets += tilegx8.c
 $(obj)/tilegx8.c: UNROLL := 8
 $(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 975c6e0434bd..a5614bfc3e85 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -77,8 +77,10 @@ const struct raid6_calls * const raid6_algos[] = {
 #ifdef CONFIG_KERNEL_MODE_NEON
 	&raid6_neonx1,
 	&raid6_neonx2,
 	&raid6_neonx4,
+#ifdef CONFIG_ARM
 	&raid6_neonx8,
 #endif
+#endif
 	NULL
 };
diff --git a/lib/raid6/neon-arm64.uS b/lib/raid6/neon-arm64.uS
new file mode 100644
index 000000000000..8873dc439281
--- /dev/null
+++ b/lib/raid6/neon-arm64.uS
@@ -0,0 +1,175 @@
+/*
+ * arm64.S - RAID-6 syndrome calculation using AArch64 NEON instructions
+ *
+ * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+	vpa0	.req	v0
+	vpb0	.req	v1
+	vqa0	.req	v2
+	vqb0	.req	v3
+	vpa1	.req	v4
+	vpb1	.req	v5
+	vqa1	.req	v6
+	vqb1	.req	v7
+	vpa2	.req	v8
+	vpb2	.req	v9
+	vqa2	.req	v10
+	vqb2	.req	v11
+	vpa3	.req	v12
+	vpb3	.req	v13
+	vqa3	.req	v14
+	vqb3	.req	v15
+	vda0	.req	v16
+	vdb0	.req	v17
+	vda1	.req	v18
+	vdb1	.req	v19
+	vda2	.req	v20
+	vdb2	.req	v21
+	vda3	.req	v22
+	vdb3	.req	v23
+	vta0	.req	v24
+	vtb0	.req	v25
+	vta1	.req	v26
+	vtb1	.req	v27
+	vta2	.req	v28
+	vtb2	.req	v29
+	vta3	.req	v30
+	vtb3	.req	v31
+
+	.macro	raid6_neon_core, inp, off
+	//
+	// Two versions of the P/Q core transform rolled into one:
+	// - the full one which takes the data input and computes both the
+	//   XOR (P) and GF(2^8) (Q) syndromes
+	// - if no input is passed, only the Q syndrome is recomputed, as is
+	//   needed for the P/Q left side optimization.
+	//
+
+	// Q: mul by x in GF(2^8) with polynomial 0x11d
+	sshr	vta$$.16b, vqa$$.16b, #7		// mask
+	.ifnb	\inp
+	add	\inp, \inp, \off			// add outer byte offset
+	.endif
+	sshr	vtb$$.16b, vqb$$.16b, #7
+	.ifnb	\inp
+	ld1	{vda$$.2d, vdb$$.2d}, [\inp], #32
+	.endif
+	shl	vqa$$.16b, vqa$$.16b, #1		// shift
+	shl	vqb$$.16b, vqb$$.16b, #1
+
+	// P: xor
+	.ifnb	\inp
+	eor	vpa$$.16b, vpa$$.16b, vda$$.16b
+	eor	vpb$$.16b, vpb$$.16b, vdb$$.16b
+	.endif
+
+	// Q: continued
+	.ifnb	\inp
+	eor	vqa$$.16b, vqa$$.16b, vda$$.16b
+	movi	vda0.16b, #0x1d
+	eor	vqb$$.16b, vqb$$.16b, vdb$$.16b
+	.endif
+	and	vta$$.16b, vta$$.16b, vda0.16b
+	and	vtb$$.16b, vtb$$.16b, vda0.16b
+	eor	vqa$$.16b, vqa$$.16b, vta$$.16b
+	eor	vqb$$.16b, vqb$$.16b, vtb$$.16b
+	.endm
+
+	/*
+	 * void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes,
+	 *				       void **ptrs)
+	 */
+ENTRY(raid6_neon$#_gen_syndrome_real)
+	add	x5, x2, x0, lsl #3
+	ldp	x6, x7, [x5, #-16]		// x6: P[], x7: Q[]
+	ldr	x8, [x5, #-24]!			// x8: datadisk(highest)[]
+	mov	x9, #0				// x9: outer byte offset
+
+.Lgen_outer:
+	mov	x10, x5
+
+	ld1	{vqa$$.2d, vqb$$.2d}, [x8], #32
+	mov	vpa$$.16b, vqa$$.16b
+	mov	vpb$$.16b, vqb$$.16b
+
+.Lgen_inner:
+	ldr	x11, [x10, #-8]!		// next datadisk
+
+	raid6_neon_core	x11, x9
+
+	cmp	x10, x2
+	bhi	.Lgen_inner
+
+	add	x9, x9, #($# * 32)		// next stripe
+	cmp	x9, x1
+
+	st1	{vpa$$.2d, vpb$$.2d}, [x6], #32
+	st1	{vqa$$.2d, vqb$$.2d}, [x7], #32
+
+	bne	.Lgen_outer
+	ret
+ENDPROC(raid6_neon$#_gen_syndrome_real)
+
+	/*
+	 * void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
+	 *				       unsigned long bytes, void **ptrs)
+	 */
+ENTRY(raid6_neon$#_xor_syndrome_real)
+	add	x5, x4, x0, lsl #3
+	ldp	x6, x7, [x5, #-16]		// x6: P[], x7: Q[]
+	add	x12, x4, x1, lsl #3		// x12: &datadisk(start)[]
+	ldr	x8, [x4, x2, lsl #3]		// x8: datadisk(stop)[]
+	mov	x9, #0				// x9: outer byte offset
+
+.Lxor_outer:
+	add	x10, x4, x2, lsl #3
+
+	ld1	{vpa$$.2d, vpb$$.2d}, [x6], #32
+	ld1	{vqa$$.2d, vqb$$.2d}, [x8], #32
+
+	eor	vpa$$.16b, vpa$$.16b, vqa$$.16b
+	eor	vpb$$.16b, vpb$$.16b, vqb$$.16b
+
+.Lxor_inner1:
+	cmp	x10, x12
+	bls	.Lxor_next_inner
+
+	ldr	x11, [x10, #-8]!		// next datadisk
+	raid6_neon_core	x11, x9
+	b	.Lxor_inner1
+
+.Lxor_next_inner:
+	subs	x10, x10, x4
+	bls	.Lxor_next
+	movi	vda0.16b, #0x1d
+
+.Lxor_inner2:
+	raid6_neon_core
+
+	subs	x10, x10, #8
+	bhi	.Lxor_inner2
+
+.Lxor_next:
+	add	x9, x9, #($# * 32)		// next stripe
+	cmp	x9, x3
+
+	ld1	{vta$$.2d, vtb$$.2d}, [x7], #32
+	sub	x6, x6, #($# * 32)
+	sub	x7, x7, #($# * 32)
+
+	eor	vqa$$.16b, vqa$$.16b, vta$$.16b
+	eor	vqb$$.16b, vqb$$.16b, vtb$$.16b
+
+	st1	{vpa$$.2d, vpb$$.2d}, [x6], #32
+	st1	{vqa$$.2d, vqb$$.2d}, [x7], #32
+
+	bne	.Lxor_outer
+	ret
+ENDPROC(raid6_neon$#_xor_syndrome_real)
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c
index 7076ef1ba3dd..683266f68118 100644
--- a/lib/raid6/neon.c
+++ b/lib/raid6/neon.c
@@ -67,4 +67,6 @@ static int raid6_have_neon(void)
 RAID6_NEON_WRAPPER(1);
 RAID6_NEON_WRAPPER(2);
 RAID6_NEON_WRAPPER(4);
+#ifdef CONFIG_ARM
 RAID6_NEON_WRAPPER(8);
+#endif