[RFC] ARM: crypto: add NEON accelerated XOR implementation - linaro-kernel

22 May 2013

Add a source file xor-neon.c (which is really just the reference
C implementation passed through the GCC vectorizer) and hook it
up to the XOR framework.
Output captured from a Cortex-A15 @ 1.7 GHz:
xor: measuring software checksum speed
   arm4regs  :  2261.600 MB/sec
   8regs     :  1771.600 MB/sec
   32regs    :  1441.600 MB/sec
   neon      :  3619.600 MB/sec
xor: using function: neon (3619.600 MB/sec)
As the xor_blocks() function could potentially be called from
interrupt context, this implementation checks for that and
reverts to the plain ARM code in that case.
Signed-off-by: Ard Biesheuvel ard.biesheuvel@linaro.org
Cc: Rob Herring rob.herring@calxeda.com
---
This is the first of a series of three patches that I have
prepared that use the NEON/VFP unit in the kernel. This one
does not use NEON code explicitly, but relies on the compiler
to generate it. The NEON code is kept in a separate compilation
unit to make absolutely sure that none of it ever gets executed
from outside a kernel_vfp_begin()/kernel_vfp_end() pair.
I have additional patches (which I will circulate later) for doing:
- RAID-6 syndrome calculations using NEON intrinsics
- bit sliced AES using NEON assembler (.S file)
This way, we have a sample of each of the various ways NEON can be
used in the kernel, and others looking to do the same can use any
of these as an example.
This patch depends on my earlier patch that implements
kernel_vfp_begin() and kernel_vfp_end().
-- 
Ard.

 arch/arm/include/asm/xor.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/lib/Makefile      |  5 ++++
 arch/arm/lib/xor-neon.c    | 31 +++++++++++++++++++
 3 files changed, 110 insertions(+)
 create mode 100644 arch/arm/lib/xor-neon.c

diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index 7604673..64c4a15 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -7,7 +7,10 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#include <linux/hardirq.h>
 #include <asm-generic/xor.h>
+#include <asm/hwcap.h>
+#include <asm/vfp.h>

 #define __XOR(a1, a2) a1 ^= a2

@@ -138,4 +141,75 @@ static struct xor_block_template xor_block_arm4regs = {
    	xor_speed(&xor_block_arm4regs);	\
    	xor_speed(&xor_block_8regs);	\
    	xor_speed(&xor_block_32regs);	\
+		NEON_TEMPLATES;			\
    } while (0)
+
+#ifdef CONFIG_NEON
+
+extern struct xor_block_template const xor_block_neon_inner;
+
+static void
+xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	if (in_interrupt()) {	/* interrupt context: use the plain ARM code */
+		xor_arm4regs_2(bytes, p1, p2);
+		return;
+	}
+	kernel_vfp_begin();	/* NEON code may only run inside this pair */
+	xor_block_neon_inner.do_2(bytes, p1, p2);
+	kernel_vfp_end();
+}
+
+static void
+xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+		unsigned long *p3)
+{
+	if (in_interrupt()) {	/* interrupt context: use the plain ARM code */
+		xor_arm4regs_3(bytes, p1, p2, p3);
+		return;
+	}
+	kernel_vfp_begin();	/* NEON code may only run inside this pair */
+	xor_block_neon_inner.do_3(bytes, p1, p2, p3);
+	kernel_vfp_end();
+}
+
+static void
+xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+		unsigned long *p3, unsigned long *p4)
+{
+	if (in_interrupt()) {	/* interrupt context: use the plain ARM code */
+		xor_arm4regs_4(bytes, p1, p2, p3, p4);
+		return;
+	}
+	kernel_vfp_begin();	/* NEON code may only run inside this pair */
+	xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
+	kernel_vfp_end();
+}
+
+static void
+xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+		unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	if (in_interrupt()) {	/* interrupt context: use the plain ARM code */
+		xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
+		return;
+	}
+	kernel_vfp_begin();	/* NEON code may only run inside this pair */
+	xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
+	kernel_vfp_end();
+}
+
+static struct xor_block_template xor_block_neon = {
+	.name	= "neon",	/* name shown in the xor speed report */
+	.do_2	= xor_neon_2,	/* hooks pick NEON or plain ARM code at call time */
+	.do_3	= xor_neon_3,
+	.do_4	= xor_neon_4,
+	.do_5	= xor_neon_5
+};
+
+#define NEON_TEMPLATES	/* benchmark NEON only when the CPU reports it */ \
+	do { if (elf_hwcap & HWCAP_NEON) xor_speed(&xor_block_neon); } while (0)
+#else	/* !CONFIG_NEON */
+#define NEON_TEMPLATES	/* no NEON support built in: expand to a no-op */ \
+	do { } while (0)
+#endif	/* CONFIG_NEON */
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index af72969..1951766 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -45,3 +45,8 @@ lib-$(CONFIG_ARCH_SHARK)	+= io-shark.o

 $(obj)/csumpartialcopy.o:	$(obj)/csumpartialcopygeneric.S
 $(obj)/csumpartialcopyuser.o:	$(obj)/csumpartialcopygeneric.S
+
+ifeq ($(CONFIG_NEON),y)
+  CFLAGS_xor-neon.o		+= -mfloat-abi=softfp -mfpu=neon
+  lib-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
+endif
diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c
new file mode 100644
index 0000000..159beaf
--- /dev/null
+++ b/arch/arm/lib/xor-neon.c
@@ -0,0 +1,31 @@
+/*
+ * linux/arch/arm/lib/xor-neon.c
+ *
+ * Copyright (C) 2013 Linaro Ltd ard.biesheuvel@linaro.org
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/raid/xor.h>
+
+#ifndef __ARM_NEON__
+#error You should compile this file with '-mfloat-abi=softfp -mfpu=neon'
+#endif
+
+/*
+ * Pull in the reference implementations while instructing GCC to attempt to
+ * exploit implicit parallelism and emit NEON instructions.
+ */
+#pragma GCC optimize "tree-vectorize"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#include <asm-generic/xor.h>
+
+struct xor_block_template const xor_block_neon_inner = {
+	.name	= "__inner_neon__",
+	.do_2	= xor_8regs_2,	/* reference C routines pulled in from */
+	.do_3	= xor_8regs_3,	/* asm-generic/xor.h above, compiled in this */
+	.do_4	= xor_8regs_4,	/* file under the tree-vectorize pragma */
+	.do_5	= xor_8regs_5,
+};
-- 
1.8.1.2