Hi Ard!
As we discussed today on IRC, I'm sending you my asm-based implementation of the RAID syndrome functions. I would be glad if you could compare this implementation with the intrinsics-based one you are currently working on.
I'm not posting my code for VFP/NEON context save/restore here. People who are interested may find the patches developed by Ard at [1]. However, I'm using the "fpu" notation in these patches, so some changes to vfp/neon might be necessary to make things work.
[1] https://patchwork.kernel.org/patch/2605041/
Thanks! Vladimir Murzin
Signed-off-by: Vladimir Murzin murzin.v@gmail.com.com --- include/linux/raid/pq.h | 7 + lib/raid6/Makefile | 5 + lib/raid6/algos.c | 9 + lib/raid6/neon.S | 540 +++++++++++++++++++++++++++++++++++++++++++++++ lib/raid6/neon.h | 62 ++++++ lib/raid6/neon_glue.c | 135 ++++++++++++ 6 files changed, 758 insertions(+) create mode 100644 lib/raid6/neon.S create mode 100644 lib/raid6/neon.h create mode 100644 lib/raid6/neon_glue.c
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 8dfaa2c..df157a3 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -101,6 +101,13 @@ extern const struct raid6_calls raid6_altivec8; extern const struct raid6_calls raid6_avx2x1; extern const struct raid6_calls raid6_avx2x2; extern const struct raid6_calls raid6_avx2x4; +extern const struct raid6_calls raid6_neon64x1; +extern const struct raid6_calls raid6_neon64x2; +extern const struct raid6_calls raid6_neon64x4; +extern const struct raid6_calls raid6_neon64x8; +extern const struct raid6_calls raid6_neon128x1; +extern const struct raid6_calls raid6_neon128x2; +extern const struct raid6_calls raid6_neon128x4;
struct raid6_recov_calls { void (*data2)(int, size_t, int, int, void **); diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 9f7c184..f73a60e 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -5,6 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o +raid6_pq-$(CONFIG_NEON) += neon_glue.o neon.o
hostprogs-y += mktables
@@ -16,6 +17,10 @@ ifeq ($(CONFIG_ALTIVEC),y) altivec_flags := -maltivec -mabi=altivec endif
+ifeq ($(CONFIG_NEON),y) +AFLAGS_neon.o := -mfloat-abi=softfp +endif + targets += int1.c $(obj)/int1.c: UNROLL := 1 $(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 6d7316f..f0313c5 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -66,6 +66,15 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_altivec4, &raid6_altivec8, #endif +#ifdef CONFIG_NEON + &raid6_neon64x1, + &raid6_neon64x2, + &raid6_neon64x4, + &raid6_neon64x8, + &raid6_neon128x1, + &raid6_neon128x2, + &raid6_neon128x4, +#endif &raid6_intx1, &raid6_intx2, &raid6_intx4, diff --git a/lib/raid6/neon.S b/lib/raid6/neon.S new file mode 100644 index 0000000..40460f0 --- /dev/null +++ b/lib/raid6/neon.S @@ -0,0 +1,540 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (c) 2013 Vladimir Murzin. + * + * Based on mmx.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/neon.S + * + * NEON implementation of RAID-6 syndrome functions + * + * NEON instructions operate on wide 64-bit and 128-bit vector registers, + * Despite support 128-bit vectors some cores execute with 64 bits at a time. + * Because of that 64-bit wide operations implemented too. 
+ */ + +#include <linux/linkage.h> + +.data +.align 4 +raid6_neon_constants: +.word 0x1d1d1d1d, 0x1d1d1d1d, 0x1d1d1d1d, 0x1d1d1d1d + +.text +.align 2 +.arm +.fpu neon + +ENTRY(raid6_neon64x1_gen_syndrome_real) + push {r4, r5, r6, r7, r8, r9, r10, r11} + ldr r7, =raid6_neon_constants + vld1.64 {d0}, [r7] + veor d5, d5, d5 + sub r7, r0, #1 + ldr r3, [r2, r7, lsl #2] + sub r7, r7, #1 + ldr r4, [r2, r7, lsl #2] + cmp r1, #0 + beq d_end_64x1 + mov r6, #0 + sub r7, r7, #1 + add r8, r2, r7, lsl #2 + sub r7, r7, #1 + cmp r7, #0 + blt z_end_64x1 + ldr r10, [r8] +d_begin_64x1: + vld1.64 {d2}, [r10]! + vorr d4, d2, d2 + sub r9, r8, #4 + mov r5, r7 +z_begin_64x1: + ldr r11, [r9] + add r11, r11, r6 + vld1.64 {d6}, [r11] + vcgt.s8 d5, d5, d4 + vadd.i8 d4, d4, d4 + vand d5, d5, d0 + veor d4, d6, d4 + veor d2, d6, d2 + veor d4, d5, d4 + veor d5, d5, d5 + sub r9, r9, #4 + sub r5, r5, #1 + cmn r5, #1 + bne z_begin_64x1 +z_end_64x1: + add r5, r3, r6 + vst1.64 {d4}, [r5] + add r5, r4, r6 + vst1.64 {d2}, [r5] + add r6, r6, #8 + cmp r6, r1 + bcc d_begin_64x1 +d_end_64x1: + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +ENDPROC(raid6_neon64x1_gen_syndrome_real) + +ENTRY(raid6_neon64x2_gen_syndrome_real) + push {r4, r5, r6, r7, r8, r9, r10, r11} + ldr r7, =raid6_neon_constants + vld1.64 {d0}, [r7] + veor d7, d7, d7 + veor d6, d6, d6 + sub r7, r0, #1 + ldr r3, [r2, r7, lsl #2] + sub r7, r7, #1 + ldr r4, [r2, r7, lsl #2] + cmp r1, #0 + beq d_end_64x2 + mov r6, #0 + sub r7, r7, #1 + add r8, r2, r7, lsl #2 + sub r7, r7, #1 + cmp r7, #0 + blt z_end_64x2 + ldr r10, [r8] +d_begin_64x2: + vld1.64 {d2-d3}, [r10]! 
+ sub r9, r8, #4 + mov r5, r7 + vorr d4, d2, d2 + vorr d5, d3, d3 +z_begin_64x2: + ldr r11, [r9] + add r11, r11, r6 + vld1.64 {d8-d9}, [r11] + vcgt.s8 d7, d7, d4 + vcgt.s8 d6, d6, d5 + vadd.i8 d4, d4, d4 + vadd.i8 d5, d5, d5 + vand d7, d7, d0 + vand d6, d6, d0 + veor d2, d8, d2 + veor d3, d9, d3 + veor d4, d8, d4 + veor d5, d9, d5 + veor d4, d7, d4 + veor d5, d6, d5 + veor d7, d7, d7 + veor d6, d6, d6 + sub r9, r9, #4 + sub r5, r5, #1 + cmn r5, #1 + bne z_begin_64x2 +z_end_64x2: + add r5, r3, r6 + vst1.64 {d4-d5}, [r5] + add r5, r4, r6 + vst1.64 {d2-d3}, [r5] + add r6, r6, #16 + cmp r6, r1 + bcc d_begin_64x2 +d_end_64x2: + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +ENDPROC(raid6_neon64x2_gen_syndrome_real) + +ENTRY(raid6_neon64x4_gen_syndrome_real) + push {r4, r5, r6, r7, r8, r9, r10, r11} + ldr r7, =raid6_neon_constants + vld1.64 {d0}, [r7] + veor d13, d13, d13 + veor d14, d14, d14 + veor d15, d15, d15 + veor d16, d16, d16 + sub r7, r0, #1 + ldr r3, [r2, r7, lsl #2] + sub r7, r7, #1 + ldr r4, [r2, r7, lsl #2] + cmp r1, #0 + beq d_end_64x4 + mov r6, #0 + sub r7, r7, #1 + add r8, r2, r7, lsl #2 + sub r7, r7, #1 + cmp r7, #0 + blt z_end_64x4 + ldr r10, [r8] +d_begin_64x4: + vld1.64 {d1-d4}, [r10]! 
+ sub r9, r8, #4 + mov r5, r7 + vorr d5, d1, d1 + vorr d6, d2, d2 + vorr d7, d3, d3 + vorr d8, d4, d4 +z_begin_64x4: + ldr r11, [r9] + add r11, r11, r6 + vld1.64 {d9-d12}, [r11] + vcgt.s8 d13, d13, d5 + vcgt.s8 d14, d14, d6 + vcgt.s8 d15, d15, d7 + vcgt.s8 d16, d16, d8 + vadd.i8 d5, d5, d5 + vadd.i8 d6, d6, d6 + vadd.i8 d7, d7, d7 + vadd.i8 d8, d8, d8 + vand d13, d13, d0 + vand d14, d14, d0 + vand d15, d15, d0 + vand d16, d16, d0 + veor d1, d9, d1 + veor d2, d10, d2 + veor d3, d11, d3 + veor d4, d12, d4 + veor d5, d9, d5 + veor d6, d10, d6 + veor d7, d11, d7 + veor d8, d12, d8 + veor d5, d13, d5 + veor d6, d14, d6 + veor d7, d15, d7 + veor d8, d16, d8 + veor d13, d13, d13 + veor d14, d14, d14 + veor d15, d15, d15 + veor d16, d16, d16 + sub r9, r9, #4 + sub r5, r5, #1 + cmn r5, #1 + bne z_begin_64x4 +z_end_64x4: + add r5, r3, r6 + vst1.64 {d5-d8}, [r5] + add r5, r4, r6 + vst1.64 {d1-d4}, [r5] + add r6, r6, #32 + cmp r6, r1 + bcc d_begin_64x4 +d_end_64x4: + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +ENDPROC(raid6_neon64x4_gen_syndrome_real) + +ENTRY(raid6_neon64x8_gen_syndrome_real) + push {r4, r5, r6, r7, r8, r9, r10, r11} + ldr r7, =raid6_neon_constants + vld1.64 {d0}, [r7] + veor d25, d25, d25 + veor d26, d26, d26 + veor d27, d27, d27 + veor d28, d28, d28 + veor d29, d29, d29 + veor d30, d30, d30 + veor d31, d31, d31 + sub r7, r0, #1 + ldr r3, [r2, r7, lsl #2] + sub r7, r7, #1 + ldr r4, [r2, r7, lsl #2] + cmp r1, #0 + beq d_end_64x8 + mov r6, #0 + sub r7, r7, #1 + add r8, r2, r7, lsl #2 + sub r7, r7, #1 + cmp r7, #0 + blt z_end_64x8 + ldr r10, [r8] +d_begin_64x8: + vld1.64 {d1-d4}, [r10]! + vorr d9, d1, d1 + vorr d10, d2, d2 + vorr d11, d3, d3 + vorr d12, d4, d4 + vld1.64 {d5-d8}, [r10]! + vorr d13, d5, d5 + vorr d14, d6, d6 + vorr d15, d7, d7 + vorr d16, d8, d8 + sub r9, r8, #4 + mov r5, r7 +z_begin_64x8: + ldr r11, [r9] + add r11, r11, r6 + vld1.64 {d17-d20}, [r11]! 
+ vcgt.s8 d25, d25, d10 + vcgt.s8 d26, d26, d11 + vcgt.s8 d27, d27, d12 + vld1.64 {d21-d24}, [r11] + vcgt.s8 d28, d28, d13 + vcgt.s8 d29, d29, d14 + vcgt.s8 d30, d30, d15 + vcgt.s8 d31, d31, d16 + vadd.i8 d10, d10, d10 + vadd.i8 d11, d11, d11 + vadd.i8 d12, d12, d12 + vadd.i8 d13, d13, d13 + vadd.i8 d14, d14, d14 + vadd.i8 d15, d15, d15 + vadd.i8 d16, d16, d16 + vand d25, d25, d0 + vand d26, d26, d0 + vand d27, d27, d0 + vand d28, d28, d0 + vand d29, d29, d0 + vand d30, d30, d0 + vand d31, d31, d0 + veor d1, d17, d1 + veor d2, d18, d2 + veor d3, d19, d3 + veor d4, d20, d4 + veor d5, d21, d5 + veor d6, d22, d6 + veor d7, d23, d7 + veor d8, d24, d8 + veor d10, d18, d10 + veor d11, d19, d11 + veor d12, d20, d12 + veor d13, d21, d13 + veor d14, d22, d14 + veor d15, d23, d15 + veor d16, d24, d16 + veor d10, d25, d10 + veor d11, d26, d11 + veor d12, d27, d12 + veor d13, d28, d13 + veor d14, d29, d14 + veor d15, d30, d15 + veor d16, d31, d16 + veor d25, d25, d25 + veor d26, d26, d26 + veor d27, d27, d27 + veor d28, d28, d28 + veor d29, d29, d29 + veor d30, d30, d30 + veor d31, d31, d31 + vcgt.s8 d31, d31, d9 + vadd.i8 d9, d9, d9 + vand d31, d31, d0 + veor d9, d17, d9 + veor d9, d31, d9 + veor d31, d31, d31 + sub r9, r9, #4 + sub r5, r5, #1 + cmn r5, #1 + bne z_begin_64x8 +z_end_64x8: + add r5, r3, r6 + vst1.64 {d9-d12}, [r5]! + vst1.64 {d13-d16}, [r5] + add r5, r4, r6 + vst1.64 {d1-d4}, [r5]! 
+ vst1.64 {d5-d8}, [r5] + add r6, r6, #64 + cmp r6, r1 + bcc d_begin_64x8 +d_end_64x8: + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +ENDPROC(raid6_neon64x8_gen_syndrome_real) + +ENTRY(raid6_neon128x1_gen_syndrome_real) + push {r4, r5, r6, r7, r8, r9, r10, r11} + ldr r7, =raid6_neon_constants + vld1.64 {q0}, [r7] + veor q5, q5, q5 + sub r7, r0, #1 + ldr r3, [r2, r7, lsl #2] + sub r7, r7, #1 + ldr r4, [r2, r7, lsl #2] + cmp r1, #0 + beq d_end_128x1 + mov r6, #0 + sub r7, r7, #1 + add r8, r2, r7, lsl #2 + sub r7, r7, #1 + cmp r7, #0 + blt z_end_128x1 + ldr r10, [r8] +d_begin_128x1: + vld1.64 {q2}, [r10]! + sub r9, r8, #4 + mov r5, r7 + vorr q4, q2, q2 +z_begin_128x1: + ldr r11, [r9] + add r11, r11, r6 + vld1.64 {q6}, [r11] + vcgt.s8 q5, q5, q4 + vadd.i8 q4, q4, q4 + vand q5, q5, q0 + veor q2, q6, q2 + veor q4, q6, q4 + veor q4, q5, q4 + veor q5, q5, q5 + sub r9, r9, #4 + sub r5, r5, #1 + cmn r5, #1 + bne z_begin_128x1 +z_end_128x1: + add r5, r3, r6 + vst1.64 {q4}, [r5] + add r5, r4, r6 + veor q4, q4, q4 + vst1.64 {q2}, [r5] + add r6, r6, #16 + veor q2, q2, q2 + cmp r6, r1 + bcc d_begin_128x1 +d_end_128x1: + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +ENDPROC(raid6_neon128x1_gen_syndrome_real) + +ENTRY(raid6_neon128x2_gen_syndrome_real) + push {r4, r5, r6, r7, r8, r9, r10, r11} + ldr r7, =raid6_neon_constants + vld1.64 {q0}, [r7] + veor q6, q6, q6 + veor q7, q7, q7 + sub r7, r0, #1 + ldr r3, [r2, r7, lsl #2] + sub r7, r7, #1 + ldr r4, [r2, r7, lsl #2] + cmp r1, #0 + beq d_end_128x2 + mov r6, #0 + sub r7, r7, #1 + add r8, r2, r7, lsl #2 + sub r7, r7, #1 + cmp r7, #0 + blt z_end_128x2 + ldr r10, [r8] +d_begin_128x2: + vld1.64 {q2-q3}, [r10]! 
+ vorr q4, q2, q2 + vorr q5, q3, q3 + sub r9, r8, #4 + mov r5, r7 +z_begin_128x2: + ldr r11, [r9] + add r11, r11, r6 + vld1.64 {q8-q9}, [r11] + vcgt.s8 q7, q7, q4 + vcgt.s8 q6, q6, q5 + vadd.i8 q4, q4, q4 + vadd.i8 q5, q5, q5 + vand q7, q7, q0 + vand q6, q6, q0 + veor q2, q8, q2 + veor q3, q9, q3 + veor q4, q8, q4 + veor q5, q9, q5 + veor q4, q7, q4 + veor q5, q6, q5 + veor q7, q7, q7 + veor q6, q6, q6 + sub r9, r9, #4 + sub r5, r5, #1 + cmn r5, #1 + bne z_begin_128x2 +z_end_128x2: + add r5, r3, r6 + vst1.64 {q4-q5}, [r5] + add r5, r4, r6 + veor q4, q4, q4 + veor q5, q5, q5 + vst1.64 {q2-q3}, [r5] + add r6, r6, #32 + veor q2, q2, q2 + veor q3, q3, q3 + cmp r6, r1 + bcc d_begin_128x2 +d_end_128x2: + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +ENDPROC(raid6_neon128x2_gen_syndrome_real) + +ENTRY(raid6_neon128x4_gen_syndrome_real) + push {r4, r5, r6, r7, r8, r9, r10, r11} + ldr r7, =raid6_neon_constants + vld1.64 {q0}, [r7] + veor q13, q13, q13 + veor q14, q14, q14 + veor q15, q15, q15 + sub r7, r0, #1 + ldr r3, [r2, r7, lsl #2] + sub r7, r7, #1 + ldr r4, [r2, r7, lsl #2] + cmp r1, #0 + beq d_end_128x4 + mov r6, #0 + sub r7, r7, #1 + add r8, r2, r7, lsl #2 + sub r7, r7, #1 + cmp r7, #0 + blt z_end_128x4 + ldr r10, [r8] +d_begin_128x4: + vld1.64 {q1-q2}, [r10]! + vorr q5, q1, q1 + vorr q6, q2, q2 + vld1.64 {q3-q4}, [r10]! + vorr q7, q3, q3 + vorr q8, q4, q4 + sub r9, r8, #4 + mov r5, r7 +z_begin_128x4: + ldr r11, [r9] + add r11, r11, r6 + vld1.64 {q9-q10}, [r11]! 
+ vld1.64 {q11-q12}, [r11] + vcgt.s8 q13, q13, q5 + vcgt.s8 q14, q14, q6 + vcgt.s8 q15, q15, q7 + vadd.i8 q5, q5, q5 + vadd.i8 q6, q6, q6 + vadd.i8 q7, q7, q7 + vand q13, q13, q0 + vand q14, q14, q0 + vand q15, q15, q0 + veor q1, q9, q1 + veor q2, q10, q2 + veor q3, q11, q3 + veor q4, q12, q4 + veor q5, q9, q5 + veor q6, q10, q6 + veor q7, q11, q7 + veor q5, q13, q5 + veor q6, q14, q6 + veor q7, q15, q7 + veor q13, q13, q13 + veor q14, q14, q14 + veor q15, q15, q15 + vcgt.s8 q15, q15, q8 + vadd.i8 q8, q8, q8 + vand q15, q15, q0 + veor q8, q12, q8 + veor q8, q15, q8 + veor q15, q15, q15 + sub r9, r9, #4 + sub r5, r5, #1 + cmn r5, #1 + bne z_begin_128x4 +z_end_128x4: + add r5, r3, r6 + vst1.64 {q5-q6}, [r5]! + vst1.64 {q7-q8}, [r5] + add r5, r4, r6 + vst1.64 {q1-q2}, [r5]! + vst1.64 {q3-q4}, [r5] + add r6, r6, #64 + cmp r6, r1 + bcc d_begin_128x4 +d_end_128x4: + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +ENDPROC(raid6_neon128x4_gen_syndrome_real) diff --git a/lib/raid6/neon.h b/lib/raid6/neon.h new file mode 100644 index 0000000..204c93e --- /dev/null +++ b/lib/raid6/neon.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2013 Vladimir Murzin. + * + * Based on mmx.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. 
+ * + **/ + + +/* + * raid6/neon.h + * + * Definitions NEON RAID-6 code + */ + +#ifndef LINUX_RAID_NEON_H +#define LINUX_RAID_NEON_H + +#ifdef CONFIG_NEON + +#ifdef __KERNEL__ /* Real code */ + +#include <asm/vfp.h> + +static inline void kernel_fpu_begin(void) +{ +} + +static inline void kernel_fpu_end(void) +{ +} + + +#else /* Dummy code for user space testing */ + +static inline void kernel_fpu_begin(void) +{ +} + +static inline void kernel_fpu_end(void) +{ +} + +#endif + +extern void raid6_neon64x1_gen_syndrome_real(int d, size_t b, void **p); +extern void raid6_neon64x2_gen_syndrome_real(int d, size_t b, void **p); +extern void raid6_neon64x4_gen_syndrome_real(int d, size_t b, void **p); +extern void raid6_neon64x8_gen_syndrome_real(int d, size_t b, void **p); + +extern void raid6_neon128x1_gen_syndrome_real(int d, size_t b, void **p); +extern void raid6_neon128x2_gen_syndrome_real(int d, size_t b, void **p); +extern void raid6_neon128x4_gen_syndrome_real(int d, size_t b, void **p); + +#endif +#endif diff --git a/lib/raid6/neon_glue.c b/lib/raid6/neon_glue.c new file mode 100644 index 0000000..733373a --- /dev/null +++ b/lib/raid6/neon_glue.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2013 Vladimir Murzin. + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. 
+ * + **/ + +/* + * raid6/neon_glue.c + * + * Glue code for the NEON optimized version of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_NEON + +#include <linux/raid/pq.h> +#include <asm/hwcap.h> +#include "neon.h" + +static int raid6_have_neon() +{ +#ifdef __KERNEL + return elf_hwcap & HWCAP_NEON; +#else + return 1; +#endif +} + +static void raid6_neon64x1_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + kernel_fpu_begin(); + raid6_neon64x1_gen_syndrome_real(disks, bytes, ptrs); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_neon64x1 = { + raid6_neon64x1_gen_syndrome, + raid6_have_neon, + "neon64x1", + 0U +}; + +static void raid6_neon64x2_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + kernel_fpu_begin(); + raid6_neon64x2_gen_syndrome_real(disks, bytes, ptrs); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_neon64x2 = { + raid6_neon64x2_gen_syndrome, + raid6_have_neon, + "neon64x2", + 0U +}; + +static void raid6_neon64x4_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + kernel_fpu_begin(); + raid6_neon64x4_gen_syndrome_real(disks, bytes, ptrs); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_neon64x4 = { + raid6_neon64x4_gen_syndrome, + raid6_have_neon, + "neon64x4", + 0U +}; + +static void raid6_neon64x8_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + kernel_fpu_begin(); + raid6_neon64x8_gen_syndrome_real(disks, bytes, ptrs); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_neon64x8 = { + raid6_neon64x8_gen_syndrome, + raid6_have_neon, + "neon64x8", + 0U +}; + +static void raid6_neon128x1_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + kernel_fpu_begin(); + raid6_neon128x1_gen_syndrome_real(disks, bytes, ptrs); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_neon128x1 = { + raid6_neon128x1_gen_syndrome, + raid6_have_neon, + "neon128x1", + 0U +}; + +static void raid6_neon128x2_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + kernel_fpu_begin(); + 
raid6_neon128x2_gen_syndrome_real(disks, bytes, ptrs); + kernel_fpu_end(); + +} + +const struct raid6_calls raid6_neon128x2 = { + raid6_neon128x2_gen_syndrome, + raid6_have_neon, + "neon128x2", + 0U +}; + +static void raid6_neon128x4_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + kernel_fpu_begin(); + raid6_neon128x4_gen_syndrome_real(disks, bytes, ptrs); + kernel_fpu_end(); + +} + +const struct raid6_calls raid6_neon128x4 = { + raid6_neon128x4_gen_syndrome, + raid6_have_neon, + "neon128x4", + 0U +}; + +#endif
On Mon, 3 Jun 2013, Vladimir Murzin wrote:
Signed-off-by: Vladimir Murzin murzin.v@gmail.com.com
Please check your git setup as gmail.com.com is not a valid host.
[...]
+/*
- raid6/neon.S
- NEON implementation of RAID-6 syndrome functions
- NEON instructions operate on wide 64-bit and 128-bit vector registers,
- Despite support 128-bit vectors some cores execute with 64 bits at a time.
- Because of that 64-bit wide operations implemented too.
- */
+#include <linux/linkage.h>
+.data +.align 4 +raid6_neon_constants: +.word 0x1d1d1d1d, 0x1d1d1d1d, 0x1d1d1d1d, 0x1d1d1d1d
+.text
If the above constants are indeed constants, you should put them in the .text section instead.
Nicolas
On Mon, Jun 03, 2013 at 04:02:15PM -0400, Nicolas Pitre wrote:
On Mon, 3 Jun 2013, Vladimir Murzin wrote:
Signed-off-by: Vladimir Murzin murzin.v@gmail.com.com
Please check your git setup as gmail.com.com is not a valid host.
Fixed, thanks.
[...]
+/*
- raid6/neon.S
- NEON implementation of RAID-6 syndrome functions
- NEON instructions operate on wide 64-bit and 128-bit vector registers,
- Despite support 128-bit vectors some cores execute with 64 bits at a time.
- Because of that 64-bit wide operations implemented too.
- */
+#include <linux/linkage.h>
+.data +.align 4 +raid6_neon_constants: +.word 0x1d1d1d1d, 0x1d1d1d1d, 0x1d1d1d1d, 0x1d1d1d1d
+.text
If the above constants are indeed constants, you should put them in the .text section instead.
Yes, they are constants. Wouldn't .rodata fit them best?
Vladimir Murzin
Nicolas
On Tue, 4 Jun 2013, Vladimir Murzin wrote:
On Mon, Jun 03, 2013 at 04:02:15PM -0400, Nicolas Pitre wrote:
On Mon, 3 Jun 2013, Vladimir Murzin wrote:
+.data +.align 4 +raid6_neon_constants: +.word 0x1d1d1d1d, 0x1d1d1d1d, 0x1d1d1d1d, 0x1d1d1d1d
+.text
If the above constants are indeed constants, you should put them in the .text section instead.
Yes, they are constants. Doesn't .rodata fit best for them?
That is fine as well.
The advantage with .text is that you can get a reference to those constants via an offset on top of the pc register, e.g.:
adr r7, raid6_neon_constants
ldm r7, {...}
The adr, when in range, is more efficient than a ldr as this is translated into:
add r1, pc, #(raid6_neon_constants - . - 8)
But that works only if the target label is in the same section as the code.
Nicolas
Signed-off-by: Vladimir Murzin murzin.v@gmail.com --- lib/raid6/test/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+)
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 087332d..50b80a7 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -34,6 +34,16 @@ else ifeq ($(HAS_ALTIVEC),yes) OBJS += altivec1.o altivec2.o altivec4.o altivec8.o endif + + HAS_NEON := $(shell echo -e ".fpu neon \t\n veor d0, d0, d0 \t\n" |\ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo yes) + ifeq ($(HAS_NEON),yes) + OBJS += neon_glue.o neon.o + CFLAGS += -DCONFIG_NEON + SFLAGS = -D__ASSEMBLY__ -include asm/unified.h + SFLAGS += -I ../../../include -I ../../../arch/arm/include + endif endif
.c.o: @@ -88,6 +98,9 @@ int32.c: int.uc ../unroll.awk tables.c: mktables ./mktables > tables.c
+neon.o: ../neon.S + $(CC) $(SFLAGS) -c -o $@ $^ + clean: rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test
linaro-kernel@lists.linaro.org