author    Yuqi <yuqi.gu@arm.com>                2019-05-24 22:20:27 +0800
committer Sergey Vojtovich <svoj@mariadb.org>   2019-05-24 18:20:27 +0400
commit    0928596a8bd54c3ec85462eb4993464eac19f8f2 (patch)
tree      e36e8f59ef2b56026d89028d89e4b12a21e33fc1 /extra
parent    a74b01ea0e35c9c4a007c64609380844b36d2867 (diff)
download  mariadb-git-0928596a8bd54c3ec85462eb4993464eac19f8f2.tar.gz
Armv8 CRC32 optimization (#772)
ARMv8 (AArch64) CPUs implement the CRC32 extension, which can be used either through inline assembly or through compiler intrinsics, so they can also benefit from hardware acceleration in IO-intensive workloads. The patch optimizes the crc32c calculation with the ARMv8 CRC/crypto instructions (intrinsics) when available, rather than the original linear (software) CRC computation.

Change-Id: I05d36a64c726d910c47befad93390108f4e6567f
Signed-off-by: Yuqi Gu <yuqi.gu@arm.com>
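For orientation, here is a minimal standalone sketch of the same idea (not code from this patch; the helper names crc32c_hw_available and crc32c_hw are illustrative, and it assumes Linux on AArch64 built with GCC/Clang and -march=armv8-a+crc): probe HWCAP_CRC32 at runtime, then drive the __crc32c* intrinsics over the buffer.

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>     /* memcpy */
    #include <sys/auxv.h>   /* getauxval, AT_HWCAP */
    #include <asm/hwcap.h>  /* HWCAP_CRC32 */
    #include <arm_acle.h>   /* __crc32cd, __crc32cb */

    /* Runtime check: does this CPU expose the ARMv8 CRC32 extension? */
    static int crc32c_hw_available(void)
    {
        return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
    }

    /* Plain 8-bytes-then-bytes CRC-32C using the hardware instructions.
     * The patch itself is far more aggressive: it consumes 1024-byte blocks
     * in three interleaved streams and merges them with carry-less multiplies. */
    static uint32_t crc32c_hw(uint32_t crc, const unsigned char *p, size_t len)
    {
        crc = ~crc;
        while (len >= sizeof(uint64_t)) {
            uint64_t v;
            memcpy(&v, p, sizeof v);      /* unaligned-safe load */
            crc = __crc32cd(crc, v);
            p += sizeof v;
            len -= sizeof v;
        }
        while (len--)
            crc = __crc32cb(crc, *p++);
        return ~crc;
    }

Compared with this simple loop, the block processing in the patch keeps three independent CRC streams in flight so that consecutive crc32cx instructions do not serialize on one another.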
Diffstat (limited to 'extra')
-rw-r--r--   extra/crc32_armv8_neon/CMakeLists.txt     8
-rw-r--r--   extra/crc32_armv8_neon/crc32_armv8.c    301
2 files changed, 309 insertions, 0 deletions
diff --git a/extra/crc32_armv8_neon/CMakeLists.txt b/extra/crc32_armv8_neon/CMakeLists.txt
new file mode 100644
index 00000000000..ba1d34d7c2e
--- /dev/null
+++ b/extra/crc32_armv8_neon/CMakeLists.txt
@@ -0,0 +1,8 @@
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include)
+INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
+
+ADD_CONVENIENCE_LIBRARY(${CRC32_LIBRARY} $<TARGET_OBJECTS:common_crc32c_armv8>)
+ADD_LIBRARY(common_crc32c_armv8 OBJECT crc32_armv8.c)
+
+SET_TARGET_PROPERTIES(common_crc32c_armv8 PROPERTIES COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS}")
+
diff --git a/extra/crc32_armv8_neon/crc32_armv8.c b/extra/crc32_armv8_neon/crc32_armv8.c
new file mode 100644
index 00000000000..20f341552e2
--- /dev/null
+++ b/extra/crc32_armv8_neon/crc32_armv8.c
@@ -0,0 +1,301 @@
+#include <my_global.h>
+#include <string.h>
+
+
+#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
+
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+
+unsigned int crc32c_aarch64_available(void)
+{
+ unsigned long auxv = getauxval(AT_HWCAP);
+ return (auxv & HWCAP_CRC32) != 0;
+}
+
+#endif
+
+#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+
+/* Request crc extension capabilities from the assembler */
+asm(".arch_extension crc");
+
+#ifdef HAVE_ARMV8_CRYPTO
+/* crypto extension */
+asm(".arch_extension crypto");
+#endif
+
+#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+
+#define CRC32C3X8(buffer, ITR) \
+ __asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
+ __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
+ __asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
+
+#define CRC32C3X8_ZERO \
+ __asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
+
+#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+/* Intrinsics header */
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
+#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
+#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
+#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
+
+#define CRC32C3X8(buffer, ITR) \
+ crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
+ crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
+ crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
+
+#define CRC32C3X8_ZERO \
+ crc0 = __crc32cd(crc0, (const uint64_t)0);
+
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+#define CRC32C7X3X8(buffer, ITR) do {\
+ CRC32C3X8(buffer, ((ITR) * 7 + 0)) \
+ CRC32C3X8(buffer, ((ITR) * 7 + 1)) \
+ CRC32C3X8(buffer, ((ITR) * 7 + 2)) \
+ CRC32C3X8(buffer, ((ITR) * 7 + 3)) \
+ CRC32C3X8(buffer, ((ITR) * 7 + 4)) \
+ CRC32C3X8(buffer, ((ITR) * 7 + 5)) \
+ CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
+ } while(0)
+
+#define CRC32C7X3X8_ZERO do {\
+ CRC32C3X8_ZERO \
+ CRC32C3X8_ZERO \
+ CRC32C3X8_ZERO \
+ CRC32C3X8_ZERO \
+ CRC32C3X8_ZERO \
+ CRC32C3X8_ZERO \
+ CRC32C3X8_ZERO \
+ } while(0)
+
+#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
+ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL1(buffer, PREF_OFFSET) \
+ PREF4X64L1(buffer,(PREF_OFFSET), 0) \
+ PREF4X64L1(buffer,(PREF_OFFSET), 4) \
+ PREF4X64L1(buffer,(PREF_OFFSET), 8) \
+ PREF4X64L1(buffer,(PREF_OFFSET), 12)
+
+#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
+ __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+ __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+ __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+ __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL2(buffer, PREF_OFFSET) \
+ PREF4X64L2(buffer,(PREF_OFFSET), 0) \
+ PREF4X64L2(buffer,(PREF_OFFSET), 4) \
+ PREF4X64L2(buffer,(PREF_OFFSET), 8) \
+ PREF4X64L2(buffer,(PREF_OFFSET), 12)
+
+
+uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
+{
+ uint32_t crc0, crc1, crc2;
+ int64_t length = (int64_t)len;
+
+ crc = 0xFFFFFFFFU;
+
+ if (buffer) {
+
+/* Crypto extension Support
+ * Process 1024 Bytes (per block)
+ */
+#ifdef HAVE_ARMV8_CRYPTO
+
+/* Intrinsics Support */
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+ const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+ uint64_t t0, t1;
+
+ /* Process per block size of 1024 Bytes
+ * A block size = 8 + 42*3*sizeof(uint64_t) + 8
+ */
+ while ((length -= 1024) >= 0) {
+ /* Prefetch 3*1024 bytes ahead to avoid L2 cache misses */
+ PREF1KL2(buffer, 1024*3);
+ /* Do first 8 bytes here for better pipelining */
+ crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
+ crc1 = 0;
+ crc2 = 0;
+ buffer += sizeof(uint64_t);
+
+ /* Process block inline
+ * Process crc0 last to avoid dependency with above
+ */
+ CRC32C7X3X8(buffer, 0);
+ CRC32C7X3X8(buffer, 1);
+ CRC32C7X3X8(buffer, 2);
+ CRC32C7X3X8(buffer, 3);
+ CRC32C7X3X8(buffer, 4);
+ CRC32C7X3X8(buffer, 5);
+
+ buffer += 42*3*sizeof(uint64_t);
+ /* Prefetch data for following block to avoid L1 cache miss */
+ PREF1KL1(buffer, 1024);
+
+ /* Last 8 bytes
+ * Merge crc0 and crc1 into crc2
+ * crc1 multiply by K2
+ * crc0 multiply by K1
+ */
+ t1 = (uint64_t)vmull_p64(crc1, k2);
+ t0 = (uint64_t)vmull_p64(crc0, k1);
+ crc = __crc32cd(crc2, *(const uint64_t *)buffer);
+ crc1 = __crc32cd(0, t1);
+ crc ^= crc1;
+ crc0 = __crc32cd(0, t0);
+ crc ^= crc0;
+
+ buffer += sizeof(uint64_t);
+ }
+
+#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+ /* No intrinsics */
+ __asm__("mov x16, #0xf38a \n\t"
+ "movk x16, #0xe417, lsl 16 \n\t"
+ "mov v1.2d[0], x16 \n\t"
+ "mov x16, #0x8014 \n\t"
+ "movk x16, #0x8f15, lsl 16 \n\t"
+ "mov v0.2d[0], x16 \n\t"
+ :::"x16");
+
+ while ((length -= 1024) >= 0) {
+ PREF1KL2(buffer, 1024*3);
+ __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
+ :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
+ crc1 = 0;
+ crc2 = 0;
+ buffer += sizeof(uint64_t);
+
+ CRC32C7X3X8(buffer, 0);
+ CRC32C7X3X8(buffer, 1);
+ CRC32C7X3X8(buffer, 2);
+ CRC32C7X3X8(buffer, 3);
+ CRC32C7X3X8(buffer, 4);
+ CRC32C7X3X8(buffer, 5);
+
+ buffer += 42*3*sizeof(uint64_t);
+ PREF1KL1(buffer, 1024);
+ __asm__("mov v2.2d[0], %x[c1] \n\t"
+ "pmull v2.1q, v2.1d, v0.1d \n\t"
+ "mov v3.2d[0], %x[c0] \n\t"
+ "pmull v3.1q, v3.1d, v1.1d \n\t"
+ "crc32cx %w[c], %w[c2], %x[v] \n\t"
+ "mov %x[c1], v2.2d[0] \n\t"
+ "crc32cx %w[c1], wzr, %x[c1] \n\t"
+ "eor %w[c], %w[c], %w[c1] \n\t"
+ "mov %x[c0], v3.2d[0] \n\t"
+ "crc32cx %w[c0], wzr, %x[c0] \n\t"
+ "eor %w[c], %w[c], %w[c0] \n\t"
+ :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
+ :[v]"r"(*((const uint64_t *)buffer)));
+ buffer += sizeof(uint64_t);
+ }
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+ /* Done if the input size is an exact multiple of 1024 bytes */
+ if(!(length += 1024))
+ return (~crc);
+
+#endif /* HAVE_ARMV8_CRYPTO */
+
+ while ((length -= sizeof(uint64_t)) >= 0) {
+ CRC32CX(crc, *(uint64_t *)buffer);
+ buffer += sizeof(uint64_t);
+ }
+ /* The following is more efficient than the straight loop */
+ if (length & sizeof(uint32_t)) {
+ CRC32CW(crc, *(uint32_t *)buffer);
+ buffer += sizeof(uint32_t);
+ }
+ if (length & sizeof(uint16_t)) {
+ CRC32CH(crc, *(uint16_t *)buffer);
+ buffer += sizeof(uint16_t);
+ }
+ if (length & sizeof(uint8_t))
+ CRC32CB(crc, *buffer);
+
+ } else {
+#ifdef HAVE_ARMV8_CRYPTO
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+ const poly64_t k1 = 0xe417f38a;
+ uint64_t t0;
+ while ((length -= 1024) >= 0) {
+ crc0 = __crc32cd(crc, 0);
+
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+
+ /* Merge crc0 into crc: crc0 multiply by K1 */
+ t0 = (uint64_t)vmull_p64(crc0, k1);
+ crc = __crc32cd(0, t0);
+ }
+#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+ __asm__("mov x16, #0xf38a \n\t"
+ "movk x16, #0xe417, lsl 16 \n\t"
+ "mov v1.2d[0], x16 \n\t"
+ :::"x16");
+
+ while ((length -= 1024) >= 0) {
+ __asm__("crc32cx %w[c0], %w[c], xzr\n\t"
+ :[c0]"=r"(crc0):[c]"r"(crc));
+
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+ CRC32C7X3X8_ZERO;
+
+ __asm__("mov v3.2d[0], %x[c0] \n\t"
+ "pmull v3.1q, v3.1d, v1.1d \n\t"
+ "mov %x[c0], v3.2d[0] \n\t"
+ "crc32cx %w[c], wzr, %x[c0] \n\t"
+ :[c]"=r"(crc)
+ :[c0]"r"(crc0));
+ }
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+ if(!(length += 1024))
+ return (~crc);
+#endif /* HAVE_ARMV8_CRYPTO */
+ while ((length -= sizeof(uint64_t)) >= 0)
+ CRC32CX(crc, 0);
+
+ /* The following is more efficient than the straight loop */
+ if (length & sizeof(uint32_t))
+ CRC32CW(crc, 0);
+
+ if (length & sizeof(uint16_t))
+ CRC32CH(crc, 0);
+
+ if (length & sizeof(uint8_t))
+ CRC32CB(crc, 0);
+ }
+
+ return (~crc);
+}
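A typical way to consume the two entry points added here (a hypothetical caller-side sketch, not part of the patch; crc32c_sw stands in for whatever software fallback the build provides) is to probe the CPU once and install the matching routine behind a function pointer:

    #include <stdint.h>

    /* Provided by extra/crc32_armv8_neon/crc32_armv8.c (this patch). */
    unsigned int crc32c_aarch64_available(void);
    uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);

    /* Assumed software fallback; the real symbol in the tree may differ. */
    uint32_t crc32c_sw(uint32_t crc, const unsigned char *buffer, uint64_t len);

    typedef uint32_t (*crc32c_func_t)(uint32_t, const unsigned char *, uint64_t);
    static crc32c_func_t crc32c_impl = crc32c_sw;

    /* Call once at startup, before any checksumming work begins. */
    void crc32c_init(void)
    {
        if (crc32c_aarch64_available())
            crc32c_impl = crc32c_aarch64;
    }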