I have a few comments below.
--
Steve
On Wed, Jan 21, 2015 at 02:17:47PM -0600, Yazen Ghannam wrote:
> Tested on AMD Seattle.
> Passes all crc32c unit tests.
> ~4x performance increase versus sctp.
Was that perf result from the unit test?
Also describe the reason for the patch in the commit log; we have an
optional CRC instruction that can be used instead of the Ceph in-built
table lookup. This patch uses the CRC instruction where available...
Unit tests are added and they show a ...
Probably also mention that this based off Ed Nevill's Hadoop patch.
>
> Signed-off-by: Yazen Ghannam
yazen.ghannam@linaro.org
> ---
> configure.ac | 1 +
> m4/ax_arm.m4 | 18 +++++++++++++---
> src/arch/arm.c | 2 ++
> src/arch/arm.h | 1 +
> src/common/Makefile.am | 11 ++++++++--
> src/common/crc32c.cc | 6 ++++++
> src/common/crc32c_arm64.c | 48 ++++++++++++++++++++++++++++++++++++++++++
> src/common/crc32c_arm64.h | 25 ++++++++++++++++++++++
> src/test/common/test_crc32c.cc | 9 ++++++++
> 9 files changed, 116 insertions(+), 5 deletions(-)
> create mode 100644 src/common/crc32c_arm64.c
> create mode 100644 src/common/crc32c_arm64.h
>
> diff --git a/configure.ac b/configure.ac
> index d836b02..60e4feb 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -575,6 +575,7 @@ AC_LANG_POP([C++])
> # Find supported SIMD / NEON / SSE extensions supported by the compiler
> AX_ARM_FEATURES()
> AM_CONDITIONAL(HAVE_NEON, [ test "x$ax_cv_support_neon_ext" = "xyes"])
> +AM_CONDITIONAL(HAVE_ARMV8_CRC, [ test "x$ax_cv_support_crc_ext" = "xyes"])
> AX_INTEL_FEATURES()
> AM_CONDITIONAL(HAVE_SSSE3, [ test "x$ax_cv_support_ssse3_ext" = "xyes"])
> AM_CONDITIONAL(HAVE_SSE4_PCLMUL, [ test "x$ax_cv_support_pclmuldq_ext" = "xyes"])
> diff --git a/m4/ax_arm.m4 b/m4/ax_arm.m4
> index 2ccc9a9..4ef8081 100644
> --- a/m4/ax_arm.m4
> +++ b/m4/ax_arm.m4
> @@ -13,13 +13,25 @@ AC_DEFUN([AX_ARM_FEATURES],
> fi
> ;;
> aarch64*)
> + AX_CHECK_COMPILE_FLAG(-march=armv8-a, ax_cv_support_armv8=yes, [])
> + if test x"$ax_cv_support_armv8" = x"yes"; then
> + ARM_ARCH_FLAGS="-march=armv8-a"
> + ARM_DEFINE_FLAGS="-DARCH_AARCH64"
> + fi
> AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, ax_cv_support_neon_ext=yes, [])
> if test x"$ax_cv_support_neon_ext" = x"yes"; then
> - ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON"
> - AC_SUBST(ARM_NEON_FLAGS)
> - ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
> + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd"
> + ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON"
> AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
> fi
> + AX_CHECK_COMPILE_FLAG(-march=armv8-a+crc, ax_cv_support_crc_ext=yes, [])
> + if test x"$ax_cv_support_crc_ext" = x"yes"; then
> + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc"
> + AC_DEFINE(HAVE_ARMV8_CRC,,[Support ARMv8 CRC instructions])
> + fi
> + ARM_NEON_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS"
> + AC_SUBST(ARM_NEON_FLAGS)
> + ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
So if the compiler supports CRC, then ARM_FLAGS will be set with CRC
compile options. This will also affect the jerasure codegen (which
doesn't check for the CRC HWCAP). Although this is unlikely to cause
problems, I would feel more comfortable with something like:
ARM_CRC_FLAGS
That way you disambiguate between NEON (for jerasure) and CRC.
> ;;
> esac
>
> diff --git a/src/arch/arm.c b/src/arch/arm.c
> index 93d079a..2ea97eb 100644
> --- a/src/arch/arm.c
> +++ b/src/arch/arm.c
> @@ -2,6 +2,7 @@
>
> /* flags we export */
> int ceph_arch_neon = 0;
> +int ceph_arch_arm64_crc32 = 0;
<bikeshedding-comment>
Can we rename this variable: ceph_arch_aarch64_crc32 ?
</bikeshedding-comment>
>
> #include <stdio.h>
>
> @@ -47,6 +48,7 @@ int ceph_arch_arm_probe(void)
> ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON;
> #elif __aarch64__ && __linux__
> ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD;
> + ceph_arch_arm64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32;
> #else
> if (0)
> get_hwcap(); // make compiler shut up
> diff --git a/src/arch/arm.h b/src/arch/arm.h
> index f613438..4ac716a 100644
> --- a/src/arch/arm.h
> +++ b/src/arch/arm.h
> @@ -6,6 +6,7 @@ extern "C" {
> #endif
>
> extern int ceph_arch_neon; /* true if we have ARM NEON or ASIMD abilities */
> +extern int ceph_arch_arm64_crc32; /* true if we have Aarch64 CRC32/CRC32C abilities */
"AArch64", we have two capital A's.
>
> extern int ceph_arch_arm_probe(void);
>
> diff --git a/src/common/Makefile.am b/src/common/Makefile.am
> index 2888194..4c36216 100644
> --- a/src/common/Makefile.am
> +++ b/src/common/Makefile.am
> @@ -103,12 +103,18 @@ libcommon_crc_la_SOURCES = \
> common/sctp_crc32.c \
> common/crc32c.cc \
> common/crc32c_intel_baseline.c \
> - common/crc32c_intel_fast.c
> + common/crc32c_intel_fast.c \
> + common/crc32c_arm64.c
I would recommend crc32c_aarch64.c here too.
>
> if WITH_GOOD_YASM_ELF64
> libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S
> libcommon_crc_la_LIBTOOLFLAGS = --tag=CC
> endif
> +
> +if HAVE_ARMV8_CRC
> +libcommon_crc_la_CFLAGS = $(AM_CFLAGS) $(ARM_FLAGS)
> +endif
> +
The problem with this is that all the files in the
libcommon_crc_la_SOURCES will be compiled with this flag. We want to
limit the scope to just the ARM CRC file.
> LIBCOMMON_DEPS += libcommon_crc.la
> noinst_LTLIBRARIES += libcommon_crc.la
>
> @@ -116,7 +122,8 @@ noinst_HEADERS += \
> common/bloom_filter.hpp \
> common/sctp_crc32.h \
> common/crc32c_intel_baseline.h \
> - common/crc32c_intel_fast.h
> + common/crc32c_intel_fast.h \
> + common/crc32c_arm64.h
>
>
> # important; libmsg before libauth!
> diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc
> index e2e81a4..540d431 100644
> --- a/src/common/crc32c.cc
> +++ b/src/common/crc32c.cc
> @@ -5,9 +5,11 @@
>
> #include "arch/probe.h"
> #include "arch/intel.h"
> +#include "arch/arm.h"
> #include "common/sctp_crc32.h"
> #include "common/crc32c_intel_baseline.h"
> #include "common/crc32c_intel_fast.h"
> +#include "common/crc32c_arm64.h"
Does pulling these ARM header files in affect how this compiles for
other architectures? Could you please give this a quick build on
x86?
>
> /*
> * choose best implementation based on the CPU architecture.
> @@ -24,6 +26,10 @@ ceph_crc32c_func_t ceph_choose_crc32(void)
> return ceph_crc32c_intel_fast;
> }
>
> + if (ceph_arch_arm64_crc32){
> + return ceph_crc32c_arm64;
> + }
> +
> // default
> return ceph_crc32c_sctp;
> }
> diff --git a/src/common/crc32c_arm64.c b/src/common/crc32c_arm64.c
> new file mode 100644
> index 0000000..6a754fb
> --- /dev/null
> +++ b/src/common/crc32c_arm64.c
> @@ -0,0 +1,48 @@
> +#include "acconfig.h"
> +#include "include/int_types.h"
> +#include "common/crc32c_arm64.h"
> +
> +#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
> +#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
> +#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
> +#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
> +
This won't likely build for other architectures.
I would recommend something like:
#ifdef __aarch64__
CRC routines
#else
empty functions
#endif /* __arch64__ */
> +uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, unsigned len)
> +{
> + int64_t length = len;
> +
> + if (!buffer) {
> +
> + while ((length -= sizeof(uint64_t)) >= 0)
> + CRC32CX(crc, 0);
> +
> + /* The following is more efficient than the straight loop */
> + if (length & sizeof(uint32_t))
> + CRC32CW(crc, 0);
> +
> + if (length & sizeof(uint16_t))
> + CRC32CH(crc, 0);
> +
> + if (length & sizeof(uint8_t))
> + CRC32CB(crc, 0);
> + } else {
> + while ((length -= sizeof(uint64_t)) >= 0) {
> + CRC32CX(crc, *(uint64_t *)buffer);
> + buffer += sizeof(uint64_t);
> + }
> +
> + /* The following is more efficient than the straight loop */
> + if (length & sizeof(uint32_t)) {
> + CRC32CW(crc, *(uint32_t *)buffer);
> + buffer += sizeof(uint32_t);
> + }
> + if (length & sizeof(uint16_t)) {
> + CRC32CH(crc, *(uint16_t *)buffer);
> + buffer += sizeof(uint16_t);
> + }
> + if (length & sizeof(uint8_t))
> + CRC32CB(crc, *buffer);
> + }
> + return crc;
> +}
> +
> diff --git a/src/common/crc32c_arm64.h b/src/common/crc32c_arm64.h
> new file mode 100644
> index 0000000..100b5e4
> --- /dev/null
> +++ b/src/common/crc32c_arm64.h
> @@ -0,0 +1,25 @@
> +#ifndef CEPH_COMMON_CRC32C_ARM64_H
> +#define CEPH_COMMON_CRC32C_ARM64_H
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#ifdef __aarch64__
> +
> +extern uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, unsigned len);
> +
> +#else
> +
> +static inline uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, unsigned len)
> +{
> + return 0;
> +}
> +
> +#endif
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc
> index b4297c6..1a9b2e2 100644
> --- a/src/test/common/test_crc32c.cc
> +++ b/src/test/common/test_crc32c.cc
> @@ -13,6 +13,7 @@
>
> #include "common/sctp_crc32.h"
> #include "common/crc32c_intel_baseline.h"
> +#include "common/crc32c_arm64.h"
>
> TEST(Crc32c, Small) {
> const char *a = "foo bar baz";
> @@ -80,6 +81,14 @@ TEST(Crc32c, Performance) {
> std::cout << "intel baseline = " << rate << " MB/sec" << std::endl;
> ASSERT_EQ(261108528u, val);
> }
> + {
> + utime_t start = ceph_clock_now(NULL);
> + unsigned val = ceph_crc32c_arm64(0, (unsigned char *)a, len);
> + utime_t end = ceph_clock_now(NULL);
> + float rate = (float)len / (float)(1024*1024) / (float)(end - start);
> + std::cout << "arm64 baseline = " << rate << " MB/sec" << std::endl;
> + ASSERT_EQ(261108528u, val);
> + }
>
What about x86? :-)
> }
>
> --
> 2.2.0
>