On Wed, May 22, 2013 at 12:02:17PM +0200, Ard Biesheuvel wrote:
Add a source file xor-neon.c (which is really just the reference C implementation passed through the GCC vectorizer) and hook it up to the XOR framework.
Output captured from a Cortex-A15 @ 1.7 GHz:
xor: measuring software checksum speed
   arm4regs  :  2261.600 MB/sec
   8regs     :  1771.600 MB/sec
   32regs    :  1441.600 MB/sec
   neon      :  3619.600 MB/sec
xor: using function: neon (3619.600 MB/sec)
As the xor_blocks() function could potentially be called from interrupt context, this implementation checks for that and reverts to the plain ARM code in that case.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Rob Herring <rob.herring@calxeda.com>
This is the first of a series of three patches that I have prepared that use the NEON/VFP unit in the kernel. This one does not use NEON code explicitly, but relies on the compiler to generate it. The NEON code is kept in a separate compilation unit to make absolutely sure that none of it ever gets executed from outside a kernel_vfp_begin()/kernel_vfp_end() pair.
I have additional patches (which I will circulate later) for doing:
- RAID-6 syndrome calculations using NEON intrinsics
- bit sliced AES using NEON assembler (.S file)
This way, we have a sample of each of the various ways NEON can be used in the kernel, and others looking to do the same can use any of these as an example.
This patch depends on my earlier patch that implements kernel_vfp_begin() and kernel_vfp_end().
Why don't you post all these patches as one series? As it is, they arrive one at a time with references to past and future patches, which forces people to keep track of the whole series in their heads instead of in their mbox.
Vladimir Murzin
-- Ard.
arch/arm/include/asm/xor.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++ arch/arm/lib/Makefile | 5 ++++ arch/arm/lib/xor-neon.c | 31 +++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100644 arch/arm/lib/xor-neon.c
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h index 7604673..64c4a15 100644 --- a/arch/arm/include/asm/xor.h +++ b/arch/arm/include/asm/xor.h @@ -7,7 +7,10 @@
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation.
*/ +#include <linux/hardirq.h> #include <asm-generic/xor.h> +#include <asm/hwcap.h> +#include <asm/vfp.h> #define __XOR(a1, a2) a1 ^= a2 @@ -138,4 +141,75 @@ static struct xor_block_template xor_block_arm4regs = { xor_speed(&xor_block_arm4regs); \ xor_speed(&xor_block_8regs); \ xor_speed(&xor_block_32regs); \
+ NEON_TEMPLATES; \ } while (0)
+#ifdef CONFIG_NEON
+extern struct xor_block_template const xor_block_neon_inner;
+static void +xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{
- if (in_interrupt()) {
xor_arm4regs_2(bytes, p1, p2);
- } else {
kernel_vfp_begin();
xor_block_neon_inner.do_2(bytes, p1, p2);
kernel_vfp_end();
- }
+}
+static void +xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
+{
- if (in_interrupt()) {
xor_arm4regs_3(bytes, p1, p2, p3);
- } else {
kernel_vfp_begin();
xor_block_neon_inner.do_3(bytes, p1, p2, p3);
kernel_vfp_end();
- }
+}
+static void +xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
+{
- if (in_interrupt()) {
xor_arm4regs_4(bytes, p1, p2, p3, p4);
- } else {
kernel_vfp_begin();
xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
kernel_vfp_end();
- }
+}
+static void +xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
- if (in_interrupt()) {
xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
- } else {
kernel_vfp_begin();
xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
kernel_vfp_end();
- }
+}
+static struct xor_block_template xor_block_neon = {
- .name = "neon",
- .do_2 = xor_neon_2,
- .do_3 = xor_neon_3,
- .do_4 = xor_neon_4,
- .do_5 = xor_neon_5
+};
+#define NEON_TEMPLATES \
- do { if (elf_hwcap & HWCAP_NEON) xor_speed(&xor_block_neon); } while (0)
+#else +#define NEON_TEMPLATES +#endif diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index af72969..1951766 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -45,3 +45,8 @@ lib-$(CONFIG_ARCH_SHARK) += io-shark.o $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S
+ifeq ($(CONFIG_NEON),y)
- CFLAGS_xor-neon.o += -mfloat-abi=softfp -mfpu=neon
- lib-$(CONFIG_XOR_BLOCKS) += xor-neon.o
+endif diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c new file mode 100644 index 0000000..159beaf --- /dev/null +++ b/arch/arm/lib/xor-neon.c @@ -0,0 +1,31 @@ +/*
- linux/arch/arm/lib/xor-neon.c
- Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation.
- */
+#include <linux/raid/xor.h>
+#ifndef __ARM_NEON__ +#error You should compile this file with '-mfloat-abi=softfp -mfpu=neon' +#endif
+/*
- Pull in the reference implementations while instructing GCC to attempt to
- exploit implicit parallelism and emit NEON instructions.
- */
+#pragma GCC optimize "tree-vectorize" +#pragma GCC diagnostic ignored "-Wunused-variable" +#include <asm-generic/xor.h>
+struct xor_block_template const xor_block_neon_inner = {
- .name = "__inner_neon__",
- .do_2 = xor_8regs_2,
- .do_3 = xor_8regs_3,
- .do_4 = xor_8regs_4,
- .do_5 = xor_8regs_5,
+};
1.8.1.2
linaro-kernel mailing list linaro-kernel@lists.linaro.org http://lists.linaro.org/mailman/listinfo/linaro-kernel