| author | Marko Mäkelä <marko.makela@mariadb.com> | 2020-09-24 10:21:26 +0300 |
|---|---|---|
| committer | Marko Mäkelä <marko.makela@mariadb.com> | 2020-09-24 10:21:26 +0300 |
| commit | 6ce0a6f9ad77e7934e27db1b73d6d98064352928 (patch) | |
| tree | 351d7da0892c9a78310ffc39754c3ec4b38a188e /mysys | |
| parent | b5c050563b1bfa1155b3b6a3b7c0c59775e77f13 (diff) | |
| parent | 882ce206dbf06b771ffe4cbce2e3e4214982f302 (diff) | |
| download | mariadb-git-6ce0a6f9ad77e7934e27db1b73d6d98064352928.tar.gz | |
Merge 10.5 into 10.6
Diffstat (limited to 'mysys')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mysys/CMakeLists.txt | 58 |
| -rw-r--r-- | mysys/crc32/crc32_arm64.c | 19 |
| -rw-r--r-- | mysys/crc32/crc32_ppc64.c | 678 |
| -rw-r--r-- | mysys/crc32/crc32_x86.c | 811 |
| -rw-r--r-- | mysys/crc32/crc32c.cc | 1254 |
| -rw-r--r-- | mysys/crc32/crc32c_ppc.c | 5 |
| -rw-r--r-- | mysys/crc32/crc32c_ppc.h | 19 |
| -rw-r--r-- | mysys/crc32/crc_ppc64.h | 664 |
| -rw-r--r-- | mysys/crc32ieee.cc (renamed from mysys/checksum.c) | 43 |
| -rw-r--r-- | mysys/mf_iocache.c | 8 |
| -rw-r--r-- | mysys/my_alloc.c | 4 |
| -rw-r--r-- | mysys/my_init.c | 3 |
| -rw-r--r-- | mysys/my_rename.c | 59 |
13 files changed, 2395 insertions, 1230 deletions
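Much of the new code in mysys/crc32/crc32_x86.c hinges on a run-time CPUID probe: `crc32_pclmul_enabled()` checks ECX bit 20 (SSE4.2) and bit 1 (PCLMULQDQ) of CPUID leaf 1 before the PCLMUL-based `crc32_pclmul()` is used. The sketch below shows one way such a probe could drive implementation selection; it is illustrative only, and `pick_crc32()` together with the `crc32_table_based()` fallback are hypothetical names, not part of this patch.

```c
#include <stddef.h>

#if defined(__GNUC__)
# include <cpuid.h>
#elif defined(_MSC_VER)
# include <intrin.h>
#endif

/* CPUID leaf 1, ECX: bit 20 = SSE4.2, bit 1 = PCLMULQDQ (the same bits the patch tests). */
#define BITS_SSE42_AND_PCLMUL ((1u << 20) | (1u << 1))

/* Provided by mysys/crc32/crc32_x86.c in this patch. */
unsigned int crc32_pclmul(unsigned int crc, const void *buf, size_t len);
/* Hypothetical portable fallback, declared only to make this sketch self-contained. */
unsigned int crc32_table_based(unsigned int crc, const void *buf, size_t len);

typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t);

/* Resolve the CRC32 implementation once, based on what the CPU reports. */
static my_crc32_t pick_crc32(void)
{
#if defined(__GNUC__)
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) &&
      (ecx & BITS_SSE42_AND_PCLMUL) == BITS_SSE42_AND_PCLMUL)
    return crc32_pclmul;
#elif defined(_MSC_VER)
  int regs[4];
  __cpuid(regs, 1);
  if (((unsigned int) regs[2] & BITS_SSE42_AND_PCLMUL) == BITS_SSE42_AND_PCLMUL)
    return crc32_pclmul;
#endif
  return crc32_table_based;
}
```

A caller would typically resolve the function pointer once at startup and route all later checksum calls through it, so the CPUID check is paid only once rather than per buffer.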
diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt index 3be4bc1b103..e7fd75b5359 100644 --- a/mysys/CMakeLists.txt +++ b/mysys/CMakeLists.txt @@ -16,7 +16,7 @@ INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/mysys) -SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c +SET(MYSYS_SOURCES array.c charset-def.c charset.c crc32ieee.cc my_default.c get_password.c errors.c hash.c list.c mf_cache.c mf_dirname.c mf_fn_ext.c @@ -45,7 +45,7 @@ SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c ../sql-common/my_time.c my_rdtsc.c psi_noop.c my_atomic_writes.c my_cpu.c my_likely.c my_largepage.c - file_logger.c my_dlerror.c) + file_logger.c my_dlerror.c crc32/crc32c.cc) IF (WIN32) SET (MYSYS_SOURCES ${MYSYS_SOURCES} @@ -58,20 +58,24 @@ IF (WIN32) my_win_popen.cc) ENDIF() -IF(NOT MSVC AND CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") - #Check for PCLMUL instruction (x86) - CHECK_C_SOURCE_COMPILES(" - int main() - { - asm volatile (\"pclmulqdq \\$0x00, %%xmm1, %%xmm0\":::\"cc\"); - return 0; - }" HAVE_CLMUL_INSTRUCTION) - - IF(HAVE_CLMUL_INSTRUCTION) +IF(MSVC) + SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c) + ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL) + IF(CLANG_CL) + SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.cc crc32/crc32c.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") + ENDIF() +ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686") + MY_CHECK_C_COMPILER_FLAG(-msse4.2) + MY_CHECK_C_COMPILER_FLAG(-mpclmul) + CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H) + CHECK_INCLUDE_FILE(x86intrin.h HAVE_X86INTRIN_H) + IF(have_C__msse4.2 AND have_C__mpclmul AND HAVE_CPUID_H AND HAVE_X86INTRIN_H) SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c) + SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c crc32/crc32c.cc PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") + ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL) ENDIF() ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") - IF(CMAKE_COMPILER_IS_GNUCC AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) + IF(CMAKE_COMPILER_IS_GNUCC) include(CheckCXXSourceCompiles) CHECK_CXX_SOURCE_COMPILES(" @@ -93,23 +97,29 @@ ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") #include <sys/auxv.h> int main() { foo(0); getauxval(AT_HWCAP); }" HAVE_ARMV8_CRYPTO) - CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_INTRINSICS) - IF(HAVE_ARMV8_CRC_CRYPTO_INTRINSICS) + CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_MARCH) + + IF(HAVE_ARMV8_CRC_CRYPTO_MARCH) + CHECK_INCLUDE_FILE(arm_acle.h HAVE_ARM_ACLE_H -march=armv8-a+crc+crypto) + IF(HAVE_ARM_ACLE_H) + ADD_DEFINITIONS(-DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS) + ENDIF() + IF(HAVE_ARMV8_CRC) + ADD_DEFINITIONS(-DHAVE_ARMV8_CRC) + ENDIF() + IF(HAVE_ARMV8_CRYPTO) + ADD_DEFINITIONS(-DHAVE_ARMV8_CRYPTO) + ENDIF() SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_arm64.c) SET_SOURCE_FILES_PROPERTIES(crc32/crc32_arm64.c PROPERTIES COMPILE_FLAGS "-march=armv8-a+crc+crypto") ENDIF() ENDIF() ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64") - SET(HAVE_CRC32_VPMSUM 1 PARENT_SCOPE) - SET(MYSYS_SOURCES ${MYSYS_SOURCES} $<TARGET_OBJECTS:crc32c> $<TARGET_OBJECTS:crc32ieee>) - - ADD_LIBRARY(crc32c OBJECT crc32/crc32_ppc64.c) - ADD_LIBRARY(crc32ieee OBJECT crc32/crc32_ppc64.c) - - SET_TARGET_PROPERTIES(crc32c crc32ieee PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -maltivec -mvsx -mpower8-vector -mcrypto -mpower8-vector") - SET_TARGET_PROPERTIES(crc32ieee PROPERTIES 
COMPILE_DEFINITIONS "CRC32_FUNCTION=my_checksum;CRC32_CONSTANTS_HEADER=\"pcc_crc32_constants.h\"") - SET_TARGET_PROPERTIES(crc32c PROPERTIES COMPILE_DEFINITIONS "CRC32_FUNCTION=crc32c_vpmsum;CRC32_CONSTANTS_HEADER=\"pcc_crc32c_constants.h\"") + SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_ppc64.c crc32/crc32c_ppc.c) + SET_SOURCE_FILES_PROPERTIES(crc32/crc32_ppc64.c crc32/crc32c_ppc.c PROPERTIES + COMPILE_FLAGS "${COMPILE_FLAGS} -maltivec -mvsx -mpower8-vector -mcrypto -mpower8-vector") + ADD_DEFINITIONS(-DHAVE_POWER8 -DHAS_ALTIVEC) ENDIF() IF(UNIX) diff --git a/mysys/crc32/crc32_arm64.c b/mysys/crc32/crc32_arm64.c index a7eb2a47442..b82d4701e6f 100644 --- a/mysys/crc32/crc32_arm64.c +++ b/mysys/crc32/crc32_arm64.c @@ -57,6 +57,12 @@ asm(".arch_extension crypto"); #define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) + + #define CRC32C3X8(buffer, ITR) \ __asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\ __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\ @@ -73,6 +79,11 @@ asm(".arch_extension crypto"); #define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value)) #define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value)) +#define CRC32X(crc, value) (crc) = __crc32d((crc), (value)) +#define CRC32W(crc, value) (crc) = __crc32w((crc), (value)) +#define CRC32H(crc, value) (crc) = __crc32h((crc), (value)) +#define CRC32B(crc, value) (crc) = __crc32b((crc), (value)) + #define CRC32C3X8(buffer, ITR) \ crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\ crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\ @@ -119,7 +130,7 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len) uint32_t crc0, crc1, crc2; int64_t length= (int64_t)len; - crc= 0xFFFFFFFFU; + crc^= 0xffffffff; /* Pmull runtime check here. * Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030). @@ -282,16 +293,16 @@ unsigned int crc32_aarch64(unsigned int crc, const void *buf, size_t len) /* if start pointer is not 8 bytes aligned */ while ((buf1 != (const uint8_t *) buf8) && len) { - crc= __crc32b(crc, *buf1++); + CRC32B(crc, *buf1++); len--; } for (; len >= 8; len-= 8) - crc= __crc32d(crc, *buf8++); + CRC32X(crc, *buf8++); buf1= (const uint8_t *) buf8; while (len--) - crc= __crc32b(crc, *buf1++); + CRC32B(crc, *buf1++); return ~crc; } diff --git a/mysys/crc32/crc32_ppc64.c b/mysys/crc32/crc32_ppc64.c index 2e8b9fc1b12..76df88ee231 100644 --- a/mysys/crc32/crc32_ppc64.c +++ b/mysys/crc32/crc32_ppc64.c @@ -1,675 +1,5 @@ -/* - * Calculate the checksum of data that is 16 byte aligned and a multiple of - * 16 bytes. - * - * The first step is to reduce it to 1024 bits. We do this in 8 parallel - * chunks in order to mask the latency of the vpmsum instructions. If we - * have more than 32 kB of data to checksum we repeat this step multiple - * times, passing in the previous 1024 bits. - * - * The next step is to reduce the 1024 bits to 64 bits. 
This step adds - * 32 bits of 0s to the end - this matches what a CRC does. We just - * calculate constants that land the data in this 32 bits. - * - * We then use fixed point Barrett reduction to compute a mod n over GF(2) - * for n = CRC using POWER8 instructions. We use x = 32. - * - * http://en.wikipedia.org/wiki/Barrett_reduction - * - * This code uses gcc vector builtins instead using assembly directly. - * - * Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of either: - * - * a) the GNU General Public License as published by the Free Software - * Foundation; either version 2 of the License, or (at your option) - * any later version, or - * b) the Apache License, Version 2.0 - */ - -#include <altivec.h> - -#define POWER8_INTRINSICS +#define CRC32_FUNCTION my_checksum #define CRC_TABLE - -#ifdef CRC32_CONSTANTS_HEADER -#include CRC32_CONSTANTS_HEADER -#else -#include "crc32_constants.h" -#endif - -#define VMX_ALIGN 16 -#define VMX_ALIGN_MASK (VMX_ALIGN-1) - -#ifdef REFLECT -static unsigned int crc32_align(unsigned int crc, const unsigned char *p, - unsigned long len) -{ - while (len--) - crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); - return crc; -} -#else -static unsigned int crc32_align(unsigned int crc, const unsigned char *p, - unsigned long len) -{ - while (len--) - crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); - return crc; -} -#endif - -static unsigned int __attribute__ ((aligned (32))) -__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); - -#ifndef CRC32_FUNCTION -#define CRC32_FUNCTION crc32_vpmsum -#endif - -unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, - unsigned long len) -{ - unsigned int prealign; - unsigned int tail; - -#ifdef CRC_XOR - crc ^= 0xffffffff; -#endif - - if (len < VMX_ALIGN + VMX_ALIGN_MASK) { - crc = crc32_align(crc, p, len); - goto out; - } - - if ((unsigned long)p & VMX_ALIGN_MASK) { - prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); - crc = crc32_align(crc, p, prealign); - len -= prealign; - p += prealign; - } - - crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); - - tail = len & VMX_ALIGN_MASK; - if (tail) { - p += len & ~VMX_ALIGN_MASK; - crc = crc32_align(crc, p, tail); - } - -out: -#ifdef CRC_XOR - crc ^= 0xffffffff; -#endif - - return crc; -} - -#if defined (__clang__) -#include "clang_workaround.h" -#else -#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b)) -#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0) -#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1) -#endif - -/* When we have a load-store in a single-dispatch group and address overlap - * such that foward is not allowed (load-hit-store) the group must be flushed. - * A group ending NOP prevents the flush. - */ -#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory") - -#if defined(__BIG_ENDIAN__) && defined (REFLECT) -#define BYTESWAP_DATA -#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) -#define BYTESWAP_DATA -#endif - -#ifdef BYTESWAP_DATA -#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\ - (__vector unsigned char) vc) -#if defined(__LITTLE_ENDIAN__) -/* Byte reverse permute constant LE. 
*/ -static const __vector unsigned long long vperm_const - __attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL, - 0x0001020304050607UL }; -#else -static const __vector unsigned long long vperm_const - __attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL, - 0X0706050403020100UL }; -#endif -#else -#define VEC_PERM(vr, va, vb, vc) -#endif - -static unsigned int __attribute__ ((aligned (32))) -__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { - - const __vector unsigned long long vzero = {0,0}; - const __vector unsigned long long vones = {0xffffffffffffffffUL, - 0xffffffffffffffffUL}; - -#ifdef REFLECT - __vector unsigned char vsht_splat; - const __vector unsigned long long vmask_32bit = - (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, - (__vector unsigned char)vones, 4); -#endif - - const __vector unsigned long long vmask_64bit = - (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, - (__vector unsigned char)vones, 8); - - __vector unsigned long long vcrc; - - __vector unsigned long long vconst1, vconst2; - - /* vdata0-vdata7 will contain our data (p). */ - __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, - vdata5, vdata6, vdata7; - - /* v0-v7 will contain our checksums */ - __vector unsigned long long v0 = {0,0}; - __vector unsigned long long v1 = {0,0}; - __vector unsigned long long v2 = {0,0}; - __vector unsigned long long v3 = {0,0}; - __vector unsigned long long v4 = {0,0}; - __vector unsigned long long v5 = {0,0}; - __vector unsigned long long v6 = {0,0}; - __vector unsigned long long v7 = {0,0}; - - - /* Vector auxiliary variables. */ - __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; - - unsigned int result = 0; - unsigned int offset; /* Constant table offset. */ - - unsigned long i; /* Counter. */ - unsigned long chunks; - - unsigned long block_size; - int next_block = 0; - - /* Align by 128 bits. The last 128 bit block will be processed at end. */ - unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; - -#ifdef REFLECT - vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc); -#else - vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL); - - /* Shift into top 32 bits */ - vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc, - (__vector unsigned char)vzero, 4); -#endif - - /* Short version. */ - if (len < 256) { - /* Calculate where in the constant table we need to start. */ - offset = 256 - len; - - vconst1 = vec_ld(offset, vcrc_short_const); - vdata0 = vec_ld(0, (__vector unsigned long long*) p); - VEC_PERM(vdata0, vdata0, vconst1, vperm_const); - - /* xor initial value*/ - vdata0 = vec_xor(vdata0, vcrc); - - vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw - ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); - v0 = vec_xor(v0, vdata0); - - for (i = 16; i < len; i += 16) { - vconst1 = vec_ld(offset + i, vcrc_short_const); - vdata0 = vec_ld(i, (__vector unsigned long long*) p); - VEC_PERM(vdata0, vdata0, vconst1, vperm_const); - vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw - ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); - v0 = vec_xor(v0, vdata0); - } - } else { - - /* Load initial values. 
*/ - vdata0 = vec_ld(0, (__vector unsigned long long*) p); - vdata1 = vec_ld(16, (__vector unsigned long long*) p); - - VEC_PERM(vdata0, vdata0, vdata0, vperm_const); - VEC_PERM(vdata1, vdata1, vdata1, vperm_const); - - vdata2 = vec_ld(32, (__vector unsigned long long*) p); - vdata3 = vec_ld(48, (__vector unsigned long long*) p); - - VEC_PERM(vdata2, vdata2, vdata2, vperm_const); - VEC_PERM(vdata3, vdata3, vdata3, vperm_const); - - vdata4 = vec_ld(64, (__vector unsigned long long*) p); - vdata5 = vec_ld(80, (__vector unsigned long long*) p); - - VEC_PERM(vdata4, vdata4, vdata4, vperm_const); - VEC_PERM(vdata5, vdata5, vdata5, vperm_const); - - vdata6 = vec_ld(96, (__vector unsigned long long*) p); - vdata7 = vec_ld(112, (__vector unsigned long long*) p); - - VEC_PERM(vdata6, vdata6, vdata6, vperm_const); - VEC_PERM(vdata7, vdata7, vdata7, vperm_const); - - /* xor in initial value */ - vdata0 = vec_xor(vdata0, vcrc); - - p = (char *)p + 128; - - do { - /* Checksum in blocks of MAX_SIZE. */ - block_size = length; - if (block_size > MAX_SIZE) { - block_size = MAX_SIZE; - } - - length = length - block_size; - - /* - * Work out the offset into the constants table to start at. Each - * constant is 16 bytes, and it is used against 128 bytes of input - * data - 128 / 16 = 8 - */ - offset = (MAX_SIZE/8) - (block_size/8); - /* We reduce our final 128 bytes in a separate step */ - chunks = (block_size/128)-1; - - vconst1 = vec_ld(offset, vcrc_const); - - va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, - (__vector unsigned long long)vconst1); - va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, - (__vector unsigned long long)vconst1); - va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, - (__vector unsigned long long)vconst1); - va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, - (__vector unsigned long long)vconst1); - va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, - (__vector unsigned long long)vconst1); - va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, - (__vector unsigned long long)vconst1); - va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, - (__vector unsigned long long)vconst1); - va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, - (__vector unsigned long long)vconst1); - - if (chunks > 1) { - offset += 16; - vconst2 = vec_ld(offset, vcrc_const); - GROUP_ENDING_NOP; - - vdata0 = vec_ld(0, (__vector unsigned long long*) p); - VEC_PERM(vdata0, vdata0, vdata0, vperm_const); - - vdata1 = vec_ld(16, (__vector unsigned long long*) p); - VEC_PERM(vdata1, vdata1, vdata1, vperm_const); - - vdata2 = vec_ld(32, (__vector unsigned long long*) p); - VEC_PERM(vdata2, vdata2, vdata2, vperm_const); - - vdata3 = vec_ld(48, (__vector unsigned long long*) p); - VEC_PERM(vdata3, vdata3, vdata3, vperm_const); - - vdata4 = vec_ld(64, (__vector unsigned long long*) p); - VEC_PERM(vdata4, vdata4, vdata4, vperm_const); - - vdata5 = vec_ld(80, (__vector unsigned long long*) p); - VEC_PERM(vdata5, vdata5, vdata5, vperm_const); - - vdata6 = vec_ld(96, (__vector unsigned long long*) p); - VEC_PERM(vdata6, vdata6, vdata6, vperm_const); - - vdata7 = vec_ld(112, (__vector unsigned long long*) p); - VEC_PERM(vdata7, vdata7, vdata7, vperm_const); - - p = (char *)p + 128; - - /* - * main loop. We modulo schedule it such that it takes three - * iterations to complete - first iteration load, second - * iteration vpmsum, third iteration xor. 
- */ - for (i = 0; i < chunks-2; i++) { - vconst1 = vec_ld(offset, vcrc_const); - offset += 16; - GROUP_ENDING_NOP; - - v0 = vec_xor(v0, va0); - va0 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata0, (__vector unsigned long long)vconst2); - vdata0 = vec_ld(0, (__vector unsigned long long*) p); - VEC_PERM(vdata0, vdata0, vdata0, vperm_const); - GROUP_ENDING_NOP; - - v1 = vec_xor(v1, va1); - va1 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata1, (__vector unsigned long long)vconst2); - vdata1 = vec_ld(16, (__vector unsigned long long*) p); - VEC_PERM(vdata1, vdata1, vdata1, vperm_const); - GROUP_ENDING_NOP; - - v2 = vec_xor(v2, va2); - va2 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata2, (__vector unsigned long long)vconst2); - vdata2 = vec_ld(32, (__vector unsigned long long*) p); - VEC_PERM(vdata2, vdata2, vdata2, vperm_const); - GROUP_ENDING_NOP; - - v3 = vec_xor(v3, va3); - va3 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata3, (__vector unsigned long long)vconst2); - vdata3 = vec_ld(48, (__vector unsigned long long*) p); - VEC_PERM(vdata3, vdata3, vdata3, vperm_const); - - vconst2 = vec_ld(offset, vcrc_const); - GROUP_ENDING_NOP; - - v4 = vec_xor(v4, va4); - va4 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata4, (__vector unsigned long long)vconst1); - vdata4 = vec_ld(64, (__vector unsigned long long*) p); - VEC_PERM(vdata4, vdata4, vdata4, vperm_const); - GROUP_ENDING_NOP; - - v5 = vec_xor(v5, va5); - va5 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata5, (__vector unsigned long long)vconst1); - vdata5 = vec_ld(80, (__vector unsigned long long*) p); - VEC_PERM(vdata5, vdata5, vdata5, vperm_const); - GROUP_ENDING_NOP; - - v6 = vec_xor(v6, va6); - va6 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata6, (__vector unsigned long long)vconst1); - vdata6 = vec_ld(96, (__vector unsigned long long*) p); - VEC_PERM(vdata6, vdata6, vdata6, vperm_const); - GROUP_ENDING_NOP; - - v7 = vec_xor(v7, va7); - va7 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata7, (__vector unsigned long long)vconst1); - vdata7 = vec_ld(112, (__vector unsigned long long*) p); - VEC_PERM(vdata7, vdata7, vdata7, vperm_const); - - p = (char *)p + 128; - } - - /* First cool down*/ - vconst1 = vec_ld(offset, vcrc_const); - offset += 16; - - v0 = vec_xor(v0, va0); - va0 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata0, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v1 = vec_xor(v1, va1); - va1 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata1, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v2 = vec_xor(v2, va2); - va2 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata2, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v3 = vec_xor(v3, va3); - va3 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata3, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v4 = vec_xor(v4, va4); - va4 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata4, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v5 = vec_xor(v5, va5); - va5 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata5, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v6 = vec_xor(v6, va6); - va6 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata6, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v7 = vec_xor(v7, va7); - va7 = __builtin_crypto_vpmsumd ((__vector 
unsigned long - long)vdata7, (__vector unsigned long long)vconst1); - }/* else */ - - /* Second cool down. */ - v0 = vec_xor(v0, va0); - v1 = vec_xor(v1, va1); - v2 = vec_xor(v2, va2); - v3 = vec_xor(v3, va3); - v4 = vec_xor(v4, va4); - v5 = vec_xor(v5, va5); - v6 = vec_xor(v6, va6); - v7 = vec_xor(v7, va7); - -#ifdef REFLECT - /* - * vpmsumd produces a 96 bit result in the least significant bits - * of the register. Since we are bit reflected we have to shift it - * left 32 bits so it occupies the least significant bits in the - * bit reflected domain. - */ - v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, - (__vector unsigned char)vzero, 4); - v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, - (__vector unsigned char)vzero, 4); - v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, - (__vector unsigned char)vzero, 4); - v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, - (__vector unsigned char)vzero, 4); - v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, - (__vector unsigned char)vzero, 4); - v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, - (__vector unsigned char)vzero, 4); - v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, - (__vector unsigned char)vzero, 4); - v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, - (__vector unsigned char)vzero, 4); -#endif - - /* xor with the last 1024 bits. */ - va0 = vec_ld(0, (__vector unsigned long long*) p); - VEC_PERM(va0, va0, va0, vperm_const); - - va1 = vec_ld(16, (__vector unsigned long long*) p); - VEC_PERM(va1, va1, va1, vperm_const); - - va2 = vec_ld(32, (__vector unsigned long long*) p); - VEC_PERM(va2, va2, va2, vperm_const); - - va3 = vec_ld(48, (__vector unsigned long long*) p); - VEC_PERM(va3, va3, va3, vperm_const); - - va4 = vec_ld(64, (__vector unsigned long long*) p); - VEC_PERM(va4, va4, va4, vperm_const); - - va5 = vec_ld(80, (__vector unsigned long long*) p); - VEC_PERM(va5, va5, va5, vperm_const); - - va6 = vec_ld(96, (__vector unsigned long long*) p); - VEC_PERM(va6, va6, va6, vperm_const); - - va7 = vec_ld(112, (__vector unsigned long long*) p); - VEC_PERM(va7, va7, va7, vperm_const); - - p = (char *)p + 128; - - vdata0 = vec_xor(v0, va0); - vdata1 = vec_xor(v1, va1); - vdata2 = vec_xor(v2, va2); - vdata3 = vec_xor(v3, va3); - vdata4 = vec_xor(v4, va4); - vdata5 = vec_xor(v5, va5); - vdata6 = vec_xor(v6, va6); - vdata7 = vec_xor(v7, va7); - - /* Check if we have more blocks to process */ - next_block = 0; - if (length != 0) { - next_block = 1; - - /* zero v0-v7 */ - v0 = vec_xor(v0, v0); - v1 = vec_xor(v1, v1); - v2 = vec_xor(v2, v2); - v3 = vec_xor(v3, v3); - v4 = vec_xor(v4, v4); - v5 = vec_xor(v5, v5); - v6 = vec_xor(v6, v6); - v7 = vec_xor(v7, v7); - } - length = length + 128; - - } while (next_block); - - /* Calculate how many bytes we have left. */ - length = (len & 127); - - /* Calculate where in (short) constant table we need to start. 
*/ - offset = 128 - length; - - v0 = vec_ld(offset, vcrc_short_const); - v1 = vec_ld(offset + 16, vcrc_short_const); - v2 = vec_ld(offset + 32, vcrc_short_const); - v3 = vec_ld(offset + 48, vcrc_short_const); - v4 = vec_ld(offset + 64, vcrc_short_const); - v5 = vec_ld(offset + 80, vcrc_short_const); - v6 = vec_ld(offset + 96, vcrc_short_const); - v7 = vec_ld(offset + 112, vcrc_short_const); - - offset += 128; - - v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata0,(__vector unsigned int)v0); - v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata1,(__vector unsigned int)v1); - v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata2,(__vector unsigned int)v2); - v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata3,(__vector unsigned int)v3); - v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata4,(__vector unsigned int)v4); - v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata5,(__vector unsigned int)v5); - v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata6,(__vector unsigned int)v6); - v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata7,(__vector unsigned int)v7); - - /* Now reduce the tail (0-112 bytes). */ - for (i = 0; i < length; i+=16) { - vdata0 = vec_ld(i,(__vector unsigned long long*)p); - VEC_PERM(vdata0, vdata0, vdata0, vperm_const); - va0 = vec_ld(offset + i,vcrc_short_const); - va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata0,(__vector unsigned int)va0); - v0 = vec_xor(v0, va0); - } - - /* xor all parallel chunks together. */ - v0 = vec_xor(v0, v1); - v2 = vec_xor(v2, v3); - v4 = vec_xor(v4, v5); - v6 = vec_xor(v6, v7); - - v0 = vec_xor(v0, v2); - v4 = vec_xor(v4, v6); - - v0 = vec_xor(v0, v4); - } - - /* Barrett Reduction */ - vconst1 = vec_ld(0, v_Barrett_const); - vconst2 = vec_ld(16, v_Barrett_const); - - v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, - (__vector unsigned char)v0, 8); - v0 = vec_xor(v1,v0); - -#ifdef REFLECT - /* shift left one bit */ - vsht_splat = vec_splat_u8 (1); - v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0, - vsht_splat); -#endif - - v0 = vec_and(v0, vmask_64bit); - -#ifndef REFLECT - - /* - * Now for the actual algorithm. The idea is to calculate q, - * the multiple of our polynomial that we need to subtract. By - * doing the computation 2x bits higher (ie 64 bits) and shifting the - * result back down 2x bits, we round down to the nearest multiple. - */ - - /* ma */ - v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0, - (__vector unsigned long long)vconst1); - /* q = floor(ma/(2^64)) */ - v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero, - (__vector unsigned char)v1, 8); - /* qn */ - v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, - (__vector unsigned long long)vconst2); - /* a - qn, subtraction is xor in GF(2) */ - v0 = vec_xor (v0, v1); - /* - * Get the result into r3. We need to shift it left 8 bytes: - * V0 [ 0 1 2 X ] - * V0 [ 0 X 2 3 ] - */ - result = __builtin_unpack_vector_1 (v0); -#else - - /* - * The reflected version of Barrett reduction. 
Instead of bit - * reflecting our data (which is expensive to do), we bit reflect our - * constants and our algorithm, which means the intermediate data in - * our vector registers goes from 0-63 instead of 63-0. We can reflect - * the algorithm because we don't carry in mod 2 arithmetic. - */ - - /* bottom 32 bits of a */ - v1 = vec_and(v0, vmask_32bit); - - /* ma */ - v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, - (__vector unsigned long long)vconst1); - - /* bottom 32bits of ma */ - v1 = vec_and(v1, vmask_32bit); - /* qn */ - v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, - (__vector unsigned long long)vconst2); - /* a - qn, subtraction is xor in GF(2) */ - v0 = vec_xor (v0, v1); - - /* - * Since we are bit reflected, the result (ie the low 32 bits) is in - * the high 32 bits. We just need to shift it left 4 bytes - * V0 [ 0 1 X 3 ] - * V0 [ 0 X 2 3 ] - */ - - /* shift result into top 64 bits of */ - v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, - (__vector unsigned char)vzero, 4); - - result = __builtin_unpack_vector_0 (v0); -#endif - - return result; -} +#define POWER8_INTRINSICS +#include "pcc_crc32_constants.h" +#include "crc_ppc64.h" diff --git a/mysys/crc32/crc32_x86.c b/mysys/crc32/crc32_x86.c index 3f176a6c145..1e5d2a0a089 100644 --- a/mysys/crc32/crc32_x86.c +++ b/mysys/crc32/crc32_x86.c @@ -1,545 +1,358 @@ -/****************************************************** -Copyright (c) 2017 Percona LLC and/or its affiliates. +/* Copyright (c) 2020 MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Implementation of CRC32 (Ethernet) uing Intel PCLMULQDQ + Ported from Intels work, see https://github.com/intel/soft-crc +*/ + +/******************************************************************************* + Copyright (c) 2009-2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ -CRC32 using Intel's PCLMUL instruction. -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; version 2 of the License. +#include <my_global.h> +#include <my_compiler.h> -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <stddef.h> -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +#if defined(__GNUC__) +#include <x86intrin.h> +#include <cpuid.h> +#elif defined(_MSC_VER) +#include <intrin.h> +#else +#error "unknown compiler" +#endif -*******************************************************/ +static int has_sse42_and_pclmul(uint32_t recx) +{ + /* 1 << 20 is SSE42, 1 << 1 is PCLMULQDQ */ +#define bits_SSE42_AND_PCLMUL (1 << 20 | 1 << 1) + return (recx & bits_SSE42_AND_PCLMUL) == bits_SSE42_AND_PCLMUL; +} -/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation - * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi> +#ifdef __GNUC__ +int crc32_pclmul_enabled(void) +{ + uint32_t reax= 0, rebx= 0, recx= 0, redx= 0; + __cpuid(1, reax, rebx, recx, redx); + return has_sse42_and_pclmul(recx); +} +#elif defined(_MSC_VER) +int crc32_pclmul_enabled(void) +{ + int regs[4]; + __cpuid(regs, 1); + return has_sse42_and_pclmul(regs[2]); +} +#endif + +/** + * @brief Shifts left 128 bit register by specified number of bytes * - * This file is part of Libgcrypt. + * @param reg 128 bit value + * @param num number of bytes to shift left \a reg by (0-16) * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. 
+ * @return \a reg << (\a num * 8) + */ +static inline __m128i xmm_shift_left(__m128i reg, const unsigned int num) +{ + static const MY_ALIGNED(16) uint8_t crc_xmm_shift_tab[48]= { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + + const __m128i *p= (const __m128i *) (crc_xmm_shift_tab + 16 - num); + + return _mm_shuffle_epi8(reg, _mm_loadu_si128(p)); +} + +struct crcr_pclmulqdq_ctx +{ + uint64_t rk1; + uint64_t rk2; + uint64_t rk5; + uint64_t rk6; + uint64_t rk7; + uint64_t rk8; +}; + +/** + * @brief Performs one folding round * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * Logically function operates as follows: + * DATA = READ_NEXT_16BYTES(); + * F1 = LSB8(FOLD) + * F2 = MSB8(FOLD) + * T1 = CLMUL(F1, RK1) + * T2 = CLMUL(F2, RK2) + * FOLD = XOR(T1, T2, DATA) * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + * @param data_block 16 byte data block + * @param precomp precomputed rk1 constanst + * @param fold running 16 byte folded data * + * @return New 16 byte folded data */ +static inline __m128i crcr32_folding_round(const __m128i data_block, + const __m128i precomp, const __m128i fold) +{ + __m128i tmp0= _mm_clmulepi64_si128(fold, precomp, 0x01); + __m128i tmp1= _mm_clmulepi64_si128(fold, precomp, 0x10); -#include <my_global.h> + return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0)); +} -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <stdint.h> +/** + * @brief Performs reduction from 128 bits to 64 bits + * + * @param data128 128 bits data to be reduced + * @param precomp rk5 and rk6 precomputed constants + * + * @return data reduced to 64 bits + */ +static inline __m128i crcr32_reduce_128_to_64(__m128i data128, const __m128i precomp) +{ + __m128i tmp0, tmp1, tmp2; -# define U64_C(c) (c ## UL) + /* 64b fold */ + tmp0= _mm_clmulepi64_si128(data128, precomp, 0x00); + tmp1= _mm_srli_si128(data128, 8); + tmp0= _mm_xor_si128(tmp0, tmp1); -typedef uint32_t u32; -typedef uint16_t u16; -typedef uint64_t u64; -#ifndef byte -typedef uint8_t byte; -#endif + /* 32b fold */ + tmp2= _mm_slli_si128(tmp0, 4); + tmp1= _mm_clmulepi64_si128(tmp2, precomp, 0x10); -# define _gcry_bswap32 __builtin_bswap32 + return _mm_xor_si128(tmp1, tmp0); +} -#if __GNUC__ >= 4 && defined(__x86_64__) +/** + * @brief Performs Barret's reduction from 64 bits to 32 bits + * + * @param data64 64 bits data to be reduced + * @param precomp rk7 precomputed constant + * + * @return data reduced to 32 bits + */ +static inline uint32_t crcr32_reduce_64_to_32(__m128i data64, const __m128i precomp) +{ + static const MY_ALIGNED(16) uint32_t mask1[4]= { + 0xffffffff, 0xffffffff, 0x00000000, 0x00000000}; + static const MY_ALIGNED(16) uint32_t mask2[4]= { + 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff}; + __m128i tmp0, tmp1, tmp2; -#if defined(_GCRY_GCC_VERSION) && _GCRY_GCC_VERSION >= 40400 /* 4.4 */ -/* Prevent compiler from issuing SSE instructions between asm blocks. 
*/ -# pragma GCC target("no-sse") -#endif + tmp0= _mm_and_si128(data64, _mm_load_si128((__m128i *) mask2)); + tmp1= _mm_clmulepi64_si128(tmp0, precomp, 0x00); + tmp1= _mm_xor_si128(tmp1, tmp0); + tmp1= _mm_and_si128(tmp1, _mm_load_si128((__m128i *) mask1)); -#define ALIGNED_16 __attribute__ ((aligned (16))) + tmp2= _mm_clmulepi64_si128(tmp1, precomp, 0x10); + tmp2= _mm_xor_si128(tmp2, tmp1); + tmp2= _mm_xor_si128(tmp2, tmp0); + return _mm_extract_epi32(tmp2, 2); +} -struct u16_unaligned_s +/** + * @brief Calculates reflected 32-bit CRC for given \a data block + * by applying folding and reduction methods. + * + * Algorithm operates on 32 bit CRCs. + * Polynomials and initial values may need to be promoted to + * 32 bits where required. + * + * @param crc initial CRC value (32 bit value) + * @param data pointer to data block + * @param data_len length of \a data block in bytes + * @param params pointer to PCLMULQDQ CRC calculation context + * + * @return CRC for given \a data block (32 bits wide). + */ +static inline uint32_t crcr32_calc_pclmulqdq(const uint8_t *data, uint32_t data_len, + uint32_t crc, + const struct crcr_pclmulqdq_ctx *params) { - u16 a; -} __attribute__((packed, aligned (1), may_alias)); + __m128i temp, fold, k; + uint32_t n; + DBUG_ASSERT(data != NULL || data_len == 0); + DBUG_ASSERT(params); -/* Constants structure for generic reflected/non-reflected CRC32 CLMUL - * functions. */ -struct crc32_consts_s -{ - /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */ - u64 k[6]; - /* my_p: { floor(x^64 / P(x)), P(x) } */ - u64 my_p[2]; -}; + if (unlikely(data_len == 0)) + return crc; + /** + * Get CRC init value + */ + temp= _mm_insert_epi32(_mm_setzero_si128(), crc, 0); -/* CLMUL constants for CRC32 and CRC32RFC1510. */ -static const struct crc32_consts_s crc32_consts ALIGNED_16 = -{ - { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */ - U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */ - U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */ - U64_C(0x163cd6124), 0 /* y = 2 */ - }, - { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */ - U64_C(0x1f7011641), U64_C(0x1db710641) - } -}; + /** + * ------------------------------------------------- + * Folding all data into single 16 byte data block + * Assumes: \a fold holds first 16 bytes of data + */ -/* Common constants for CRC32 algorithms. 
*/ -static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 = - { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - }; -static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 = - { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - }; -static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 = + if (unlikely(data_len < 32)) { - { U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */ - { U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) }, - { U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) }, - { U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) }, - { U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) }, - { U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) }, - { U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */ - }; -static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 = - { - { U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */ - { U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) }, - { U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */ - }; - -/* PCLMUL functions for reflected CRC32. */ -static inline void -crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, - const struct crc32_consts_s *consts) -{ - if (inlen >= 8 * 16) + if (unlikely(data_len == 16)) { - asm volatile ("movd %[crc], %%xmm4\n\t" - "movdqu %[inbuf_0], %%xmm0\n\t" - "movdqu %[inbuf_1], %%xmm1\n\t" - "movdqu %[inbuf_2], %%xmm2\n\t" - "movdqu %[inbuf_3], %%xmm3\n\t" - "pxor %%xmm4, %%xmm0\n\t" - : - : [inbuf_0] "m" (inbuf[0 * 16]), - [inbuf_1] "m" (inbuf[1 * 16]), - [inbuf_2] "m" (inbuf[2 * 16]), - [inbuf_3] "m" (inbuf[3 * 16]), - [crc] "m" (*pcrc) - ); - - inbuf += 4 * 16; - inlen -= 4 * 16; - - asm volatile ("movdqa %[k1k2], %%xmm4\n\t" - : - : [k1k2] "m" (consts->k[1 - 1]) - ); - - /* Fold by 4. */ - while (inlen >= 4 * 16) - { - asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t" - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm0\n\t" - "pxor %%xmm6, %%xmm0\n\t" - - "movdqu %[inbuf_1], %%xmm5\n\t" - "movdqa %%xmm1, %%xmm6\n\t" - "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t" - "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm1\n\t" - "pxor %%xmm6, %%xmm1\n\t" - - "movdqu %[inbuf_2], %%xmm5\n\t" - "movdqa %%xmm2, %%xmm6\n\t" - "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t" - "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm6, %%xmm2\n\t" - - "movdqu %[inbuf_3], %%xmm5\n\t" - "movdqa %%xmm3, %%xmm6\n\t" - "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t" - "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm6, %%xmm3\n\t" - : - : [inbuf_0] "m" (inbuf[0 * 16]), - [inbuf_1] "m" (inbuf[1 * 16]), - [inbuf_2] "m" (inbuf[2 * 16]), - [inbuf_3] "m" (inbuf[3 * 16]) - ); - - inbuf += 4 * 16; - inlen -= 4 * 16; - } - - asm volatile ("movdqa %[k3k4], %%xmm6\n\t" - "movdqa %[my_p], %%xmm5\n\t" - : - : [k3k4] "m" (consts->k[3 - 1]), - [my_p] "m" (consts->my_p[0]) - ); - - /* Fold 4 to 1. 
*/ - - asm volatile ("movdqa %%xmm0, %%xmm4\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t" - "pxor %%xmm1, %%xmm0\n\t" - "pxor %%xmm4, %%xmm0\n\t" - - "movdqa %%xmm0, %%xmm4\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t" - "pxor %%xmm2, %%xmm0\n\t" - "pxor %%xmm4, %%xmm0\n\t" - - "movdqa %%xmm0, %%xmm4\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t" - "pxor %%xmm3, %%xmm0\n\t" - "pxor %%xmm4, %%xmm0\n\t" - : - : - ); + /* 16 bytes */ + fold= _mm_loadu_si128((__m128i *) data); + fold= _mm_xor_si128(fold, temp); + goto reduction_128_64; } - else + if (unlikely(data_len < 16)) { - asm volatile ("movd %[crc], %%xmm1\n\t" - "movdqu %[inbuf], %%xmm0\n\t" - "movdqa %[k3k4], %%xmm6\n\t" - "pxor %%xmm1, %%xmm0\n\t" - "movdqa %[my_p], %%xmm5\n\t" - : - : [inbuf] "m" (*inbuf), - [crc] "m" (*pcrc), - [k3k4] "m" (consts->k[3 - 1]), - [my_p] "m" (consts->my_p[0]) - ); - - inbuf += 16; - inlen -= 16; + /* 0 to 15 bytes */ + MY_ALIGNED(16) uint8_t buffer[16]; + + memset(buffer, 0, sizeof(buffer)); + memcpy(buffer, data, data_len); + + fold= _mm_load_si128((__m128i *) buffer); + fold= _mm_xor_si128(fold, temp); + if ((data_len < 4)) + { + fold= xmm_shift_left(fold, 8 - data_len); + goto barret_reduction; + } + fold= xmm_shift_left(fold, 16 - data_len); + goto reduction_128_64; } + /* 17 to 31 bytes */ + fold= _mm_loadu_si128((__m128i *) data); + fold= _mm_xor_si128(fold, temp); + n= 16; + k= _mm_load_si128((__m128i *) (¶ms->rk1)); + goto partial_bytes; + } - /* Fold by 1. */ - if (inlen >= 16) - { - while (inlen >= 16) - { - /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */ - asm volatile ("movdqu %[inbuf], %%xmm2\n\t" - "movdqa %%xmm0, %%xmm1\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t" - "pxor %%xmm2, %%xmm0\n\t" - "pxor %%xmm1, %%xmm0\n\t" - : - : [inbuf] "m" (*inbuf) - ); - - inbuf += 16; - inlen -= 16; - } - } + /** + * At least 32 bytes in the buffer + */ + + /** + * Apply CRC initial value + */ + fold= _mm_loadu_si128((const __m128i *) data); + fold= _mm_xor_si128(fold, temp); + + /** + * Main folding loop + * - the last 16 bytes is processed separately + */ + k= _mm_load_si128((__m128i *) (¶ms->rk1)); + for (n= 16; (n + 16) <= data_len; n+= 16) + { + temp= _mm_loadu_si128((__m128i *) &data[n]); + fold= crcr32_folding_round(temp, k, fold); + } - /* Partial fold. */ - if (inlen) - { - /* Load last input and add padding zeros. 
*/ - asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t" - "movdqu %[shl_shuf], %%xmm4\n\t" - "movdqu %[mask], %%xmm2\n\t" - - "movdqa %%xmm0, %%xmm1\n\t" - "pshufb %%xmm4, %%xmm0\n\t" - "movdqu %[inbuf], %%xmm4\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "por %%xmm1, %%xmm2\n\t" - - "movdqa %%xmm0, %%xmm1\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t" - "pxor %%xmm2, %%xmm0\n\t" - "pxor %%xmm1, %%xmm0\n\t" - : - : [inbuf] "m" (*(inbuf - 16 + inlen)), - [mask] "m" (crc32_partial_fold_input_mask[inlen]), - [shl_shuf] "m" (crc32_refl_shuf_shift[inlen]), - [shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16]) - ); - - inbuf += inlen; - inlen -= inlen; - } +partial_bytes: + if (likely(n < data_len)) + { + static const MY_ALIGNED(16) uint32_t mask3[4]= {0x80808080, 0x80808080, + 0x80808080, 0x80808080}; + static const MY_ALIGNED(16) uint8_t shf_table[32]= { + 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, + 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + __m128i last16, a, b; - /* Final fold. */ - asm volatile (/* reduce 128-bits to 96-bits */ - "movdqa %%xmm0, %%xmm1\n\t" - "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t" - "psrldq $8, %%xmm1\n\t" - "pxor %%xmm1, %%xmm0\n\t" - - /* reduce 96-bits to 64-bits */ - "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */ - "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */ - "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */ - "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */ - - /* barrett reduction */ - "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */ - "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */ - "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */ - "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */ - "pxor %%xmm1, %%xmm0\n\t" - - /* store CRC */ - "pextrd $2, %%xmm0, %[out]\n\t" - : [out] "=m" (*pcrc) - : [k5] "m" (consts->k[5 - 1]) - ); -} + last16= _mm_loadu_si128((const __m128i *) &data[data_len - 16]); -static inline void -crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen, - const struct crc32_consts_s *consts) -{ - if (inlen < 4) - { - u32 crc = *pcrc; - u32 data; - - asm volatile ("movdqa %[my_p], %%xmm5\n\t" - : - : [my_p] "m" (consts->my_p[0]) - ); - - if (inlen == 1) - { - data = inbuf[0]; - data ^= crc; - data <<= 24; - crc >>= 8; - } - else if (inlen == 2) - { - data = ((const struct u16_unaligned_s *)inbuf)->a; - data ^= crc; - data <<= 16; - crc >>= 16; - } - else - { - data = ((const struct u16_unaligned_s *)inbuf)->a; - data |= ((u32) inbuf[2]) << 16; - data ^= crc; - data <<= 8; - crc >>= 24; - } - - /* Barrett reduction */ - asm volatile ("movd %[in], %%xmm0\n\t" - "movd %[crc], %%xmm1\n\t" - - "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */ - "psllq $32, %%xmm1\n\t" - "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */ - "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */ - "pxor %%xmm1, %%xmm0\n\t" - - "pextrd $1, %%xmm0, %[out]\n\t" - : [out] "=m" (*pcrc) - : [in] "rm" (data), - [crc] "rm" (crc) - ); - } - else if (inlen == 4) - { - /* Barrett reduction */ - asm volatile ("movd %[crc], %%xmm1\n\t" - "movd %[in], %%xmm0\n\t" - "movdqa %[my_p], %%xmm5\n\t" - "pxor %%xmm1, %%xmm0\n\t" - - "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */ - "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */ - "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */ - - 
"pextrd $1, %%xmm0, %[out]\n\t" - : [out] "=m" (*pcrc) - : [in] "m" (*inbuf), - [crc] "m" (*pcrc), - [my_p] "m" (consts->my_p[0]) - ); - } - else - { - asm volatile ("movdqu %[shuf], %%xmm4\n\t" - "movd %[crc], %%xmm1\n\t" - "movdqa %[my_p], %%xmm5\n\t" - "movdqa %[k3k4], %%xmm6\n\t" - : - : [shuf] "m" (crc32_refl_shuf_shift[inlen]), - [crc] "m" (*pcrc), - [my_p] "m" (consts->my_p[0]), - [k3k4] "m" (consts->k[3 - 1]) - ); - - if (inlen >= 8) - { - asm volatile ("movq %[inbuf], %%xmm0\n\t" - : - : [inbuf] "m" (*inbuf) - ); - if (inlen > 8) - { - asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/ - "movq %[inbuf_tail], %%xmm2\n\t" - "punpcklqdq %%xmm2, %%xmm0\n\t" - "pshufb %[merge_shuf], %%xmm0\n\t" - : - : [inbuf_tail] "m" (inbuf[inlen - 8]), - [merge_shuf] "m" - (*crc32_merge9to15_shuf[inlen - 9]) - ); - } - } - else - { - asm volatile ("movd %[inbuf], %%xmm0\n\t" - "pinsrd $1, %[inbuf_tail], %%xmm0\n\t" - "pshufb %[merge_shuf], %%xmm0\n\t" - : - : [inbuf] "m" (*inbuf), - [inbuf_tail] "m" (inbuf[inlen - 4]), - [merge_shuf] "m" - (*crc32_merge5to7_shuf[inlen - 5]) - ); - } - - /* Final fold. */ - asm volatile ("pxor %%xmm1, %%xmm0\n\t" - "pshufb %%xmm4, %%xmm0\n\t" - - /* reduce 128-bits to 96-bits */ - "movdqa %%xmm0, %%xmm1\n\t" - "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t" - "psrldq $8, %%xmm1\n\t" - "pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */ - - /* reduce 96-bits to 64-bits */ - "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */ - "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */ - "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */ - "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */ - - /* barrett reduction */ - "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */ - "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */ - "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */ - "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */ - "pxor %%xmm1, %%xmm0\n\t" - - /* store CRC */ - "pextrd $2, %%xmm0, %[out]\n\t" - : [out] "=m" (*pcrc) - : [k5] "m" (consts->k[5 - 1]) - ); - } -} + temp= _mm_loadu_si128((const __m128i *) &shf_table[data_len & 15]); + a= _mm_shuffle_epi8(fold, temp); -void -crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen) -{ - const struct crc32_consts_s *consts = &crc32_consts; -#if defined(__x86_64__) && defined(__WIN64__) - char win64tmp[2 * 16]; - - /* XMM6-XMM7 need to be restored after use. */ - asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" - "movdqu %%xmm7, 1*16(%0)\n\t" - : - : "r" (win64tmp) - : "memory"); -#endif + temp= _mm_xor_si128(temp, _mm_load_si128((const __m128i *) mask3)); + b= _mm_shuffle_epi8(fold, temp); + b= _mm_blendv_epi8(b, last16, temp); - if (!inlen) - return; - - if (inlen >= 16) - crc32_reflected_bulk(pcrc, inbuf, inlen, consts); - else - crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts); - -#if defined(__x86_64__) && defined(__WIN64__) - /* Restore used registers. */ - asm volatile("movdqu 0*16(%0), %%xmm6\n\t" - "movdqu 1*16(%0), %%xmm7\n\t" - : - : "r" (win64tmp) - : "memory"); -#endif -} + /* k = rk1 & rk2 */ + temp= _mm_clmulepi64_si128(a, k, 0x01); + fold= _mm_clmulepi64_si128(a, k, 0x10); -#ifdef __GNUC__ -int crc32_pclmul_enabled(void) -{ - int eax, ecx; - /* We assume that the CPUID instruction and its parameter 1 are available. - We do not support any precursors of the Intel 80486. 
*/ - asm("cpuid" : "=a"(eax), "=c"(ecx) : "0"(1) : "ebx", "edx"); - return !(~ecx & (1 << 19 | 1 << 1)); -} -#elif 0 /* defined _MSC_VER */ /* FIXME: implement the pclmul interface */ -#include <intrin.h> -int crc32_pclmul_enabled(void) -{ - /* We assume that the CPUID instruction and its parameter 1 are available. - We do not support any precursors of the Intel 80486. */ - int regs[4]; - __cpuid(regs, 1); - return !(~regs[2] & (1 << 19 | 1 << 1)); -} -#else -int crc32_pclmul_enabled(void) -{ - return 0; + fold= _mm_xor_si128(fold, temp); + fold= _mm_xor_si128(fold, b); + } + + /** + * ------------------------------------------------- + * Reduction 128 -> 32 + * Assumes: \a fold holds 128bit folded data + */ +reduction_128_64: + k= _mm_load_si128((__m128i *) (¶ms->rk5)); + fold= crcr32_reduce_128_to_64(fold, k); + +barret_reduction: + k= _mm_load_si128((__m128i *) (¶ms->rk7)); + n= crcr32_reduce_64_to_32(fold, k); + return n; } -#endif +static const MY_ALIGNED(16) struct crcr_pclmulqdq_ctx ether_crc32_clmul= { + 0xccaa009e, /**< rk1 */ + 0x1751997d0, /**< rk2 */ + 0xccaa009e, /**< rk5 */ + 0x163cd6124, /**< rk6 */ + 0x1f7011640, /**< rk7 */ + 0x1db710641 /**< rk8 */ +}; + +/** + * @brief Calculates Ethernet CRC32 using PCLMULQDQ method. + * + * @param data pointer to data block to calculate CRC for + * @param data_len size of data block + * + * @return New CRC value + */ unsigned int crc32_pclmul(unsigned int crc32, const void *buf, size_t len) { - crc32= ~crc32; - crc32_intel_pclmul(&crc32, buf, len); - return ~crc32; + return ~crcr32_calc_pclmulqdq(buf, (uint32_t)len, ~crc32, ðer_crc32_clmul); } -#endif diff --git a/mysys/crc32/crc32c.cc b/mysys/crc32/crc32c.cc new file mode 100644 index 00000000000..4eaceb8c438 --- /dev/null +++ b/mysys/crc32/crc32c.cc @@ -0,0 +1,1254 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. + +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ + +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <my_global.h> +#include <my_byteorder.h> +static inline uint32_t DecodeFixed32(const char *ptr) +{ + return uint4korr(ptr); +} + +static inline uint64_t DecodeFixed64(const char *ptr) +{ + return uint8korr(ptr); +} + +#include <stdint.h> +#ifdef _MSC_VER +#include <intrin.h> +#endif + +#ifdef HAVE_SSE42 +#include <nmmintrin.h> +#include <wmmintrin.h> +#ifdef __GNUC__ +#include <cpuid.h> +#endif +#endif + + +#ifdef __powerpc64__ +#include "crc32c_ppc.h" + +#if __linux__ +#include <sys/auxv.h> + +#ifndef PPC_FEATURE2_VEC_CRYPTO +#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#endif /* __linux__ */ + +#endif + +namespace mysys_namespace { +namespace crc32c { + +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +#ifdef __powerpc64__ +static int arch_ppc_crc32 = 0; +#endif /* __powerpc64__ */ +#endif + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 
0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 
0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 
0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast<const char*>(p)); +} + +#if defined(HAVE_SSE42) && (SIZEOF_SIZE_T == 8) +static inline uint64_t LE_LOAD64(const uint8_t 
*p) { + return DecodeFixed64(reinterpret_cast<const char*>(p)); +} +#endif + +static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { + uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p)); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 8) & 0xff] ^ + table1_[(c >> 16) & 0xff] ^ + table0_[c >> 24]; + // DO it twice. + c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p)); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 8) & 0xff] ^ + table1_[(c >> 16) & 0xff] ^ + table0_[c >> 24]; +} + +static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) { +#ifndef HAVE_SSE42 + Slow_CRC32(l, p); +#elif (SIZEOF_SIZE_T == 8) + *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); + *p += 8; +#else + *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p)); + *p += 4; + *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p)); + *p += 4; +#endif +} + +template<void (*CRC32)(uint64_t*, uint8_t const**)> +uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { + + const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); + const uint8_t *e = p + size; + uint64_t l = crc ^ 0xffffffffu; + +// Align n to (1 << m) byte boundary +#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1)) + +#define STEP1 do { \ + int c = (l & 0xff) ^ *p++; \ + l = table0_[c] ^ (l >> 8); \ +} while (0) + + + // Point x at first 16-byte aligned byte in string. This might be + // just past the end of the string. + const uintptr_t pval = reinterpret_cast<uintptr_t>(p); + const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4)); + if (x <= e) { + // Process bytes until finished or p is 16-byte aligned + while (p != x) { + STEP1; + } + } + // Process bytes 16 at a time + while ((e-p) >= 16) { + CRC32(&l, &p); + CRC32(&l, &p); + } + // Process bytes 8 at a time + while ((e-p) >= 8) { + CRC32(&l, &p); + } + // Process the last few bytes + while (p != e) { + STEP1; + } +#undef STEP1 +#undef ALIGN + return static_cast<uint32_t>(l ^ 0xffffffffu); +} + +// Detect if ARM64 CRC or not. +#ifndef HAVE_ARMV8_CRC +// Detect if SS42 or not. 
+#ifndef HAVE_POWER8 + +static bool isSSE42() { +#ifndef HAVE_SSE42 + return false; +#elif defined(__GNUC__) + uint32_t reax= 0, rebx= 0, recx= 0, redx= 0; + __cpuid(1, reax, rebx, recx, redx); + return (recx & ((int)1 << 20)) != 0; +#elif defined(_MSC_VER) + int info[4]; + __cpuid(info, 0x00000001); + return (info[2] & ((int)1 << 20)) != 0; +#else + return false; +#endif +} + +#ifdef HAVE_SSE42 +extern "C" int crc32_pclmul_enabled(); +#endif + +static bool isPCLMULQDQ() { +#ifdef HAVE_SSE42 + return crc32_pclmul_enabled(); +#else + return false; +#endif +} + +#endif // HAVE_POWER8 +#endif // HAVE_ARMV8_CRC + +typedef uint32_t (*Function)(uint32_t, const char*, size_t); + +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +uint32_t ExtendPPCImpl(uint32_t crc, const char *buf, size_t size) { + return crc32c_ppc(crc, (const unsigned char *)buf, size); +} + +#if __linux__ +static int arch_ppc_probe(void) { + arch_ppc_crc32 = 0; + +#if defined(__powerpc64__) + if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1; +#endif /* __powerpc64__ */ + + return arch_ppc_crc32; +} +#endif // __linux__ + +static bool isAltiVec() { + if (arch_ppc_probe()) { + return true; + } else { + return false; + } +} +#endif + +#if defined(HAVE_ARMV8_CRC) +extern "C" const char *crc32c_aarch64_available(void); +extern "C" uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len); + +static uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) { + return crc32c_aarch64(crc, (const unsigned char *)buf, (size_t) size); +} +#endif + +extern "C" const char * my_crc32c_implementation() +{ +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) + if (arch_ppc_probe()) + return "Using POWER8 crc32 instructions"; +#elif defined(HAVE_ARMV8_CRC) + const char *ret = crc32c_aarch64_available(); + if (ret) + return ret ; +#elif HAVE_SSE42 + if (isSSE42()) + { + if (SIZEOF_SIZE_T == 8 && isPCLMULQDQ()) + return "Using crc32 + pclmulqdq instructions"; + return "Using SSE4.2 crc32 instructions"; + } +#endif + return "Using generic crc32 instructions"; +} + + +/* + * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ * Ferry Toth + * ftoth@exalondelft.nl + * + * https://github.com/htot/crc32c + * + * Modified by Facebook + * + * Original intel whitepaper: + * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * + * This version is from the folly library, created by Dave Watson <davejwatson@fb.com> + * +*/ +#if defined HAVE_SSE42 && defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8 + + +#define CRCtriplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \ + crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); + +#define CRCduplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); + +#define CRCsinglet(crc, buf, offset) \ + crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); + + +// Numbers taken directly from intel whitepaper. +// clang-format off +static const uint64_t clmul_constants[] = { + 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6, + 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e, + 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da, + 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8, + 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296, + 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2, + 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6, + 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092, + 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0, + 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456, + 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e, + 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a, + 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574, + 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832, + 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124, + 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86, + 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e, + 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a, + 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46, + 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a, + 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a, + 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4, + 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56, + 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2, + 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c, + 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac, + 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64, + 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e, + 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c, + 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28, + 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26, + 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c, + 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c, + 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c, + 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4, + 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844, + 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c, + 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730, + 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c, + 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2, + 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2, + 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e, + 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a, + 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a, + 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a, + 0x00a2a8d7e, 0x05b397730, 
0x168763fa6, 0x0b0cd4768, + 0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4, + 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c, + 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba, + 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312, + 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544, + 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a, + 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e, + 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a, + 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c, + 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a, + 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6, + 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca, + 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888, + 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e, + 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528, + 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a, + 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e, + 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa, +}; + +// Compute the crc32c value for buffer smaller than 8 +static inline void align_to_8( + size_t len, + uint64_t& crc0, // crc so far, updated on return + const unsigned char*& next) { // next data pointer, updated on return + uint32_t crc32bit = static_cast<uint32_t>(crc0); + if (len & 0x04) { + crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next); + next += sizeof(uint32_t); + } + if (len & 0x02) { + crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next); + next += sizeof(uint16_t); + } + if (len & 0x01) { + crc32bit = _mm_crc32_u8(crc32bit, *(next)); + next++; + } + crc0 = crc32bit; +} + +// +// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well +// chosen constant and xor's these with the remaining CRC. +// +static inline uint64_t CombineCRC( + size_t block_size, + uint64_t crc0, + uint64_t crc1, + uint64_t crc2, + const uint64_t* next2) { + const auto multiplier = + *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1); + const auto crc0_xmm = _mm_set_epi64x(0, crc0); + const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00); + const auto crc1_xmm = _mm_set_epi64x(0, crc1); + const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10); + const auto res = _mm_xor_si128(res0, res1); + crc0 = _mm_cvtsi128_si64(res); + crc0 = crc0 ^ *((uint64_t*)next2 - 1); + crc2 = _mm_crc32_u64(crc2, crc0); + return crc2; +} + +// Compute CRC-32C using the Intel hardware instruction. +static inline uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) { + const unsigned char* next = (const unsigned char*)buf; + uint64_t count; + uint64_t crc0, crc1, crc2; + crc0 = crc ^ 0xffffffffu; + + + if (len >= 8) { + // if len > 216 then align and use triplets + if (len > 216) { + { + // Work on the bytes (< 8) before the first 8-byte alignment addr starts + auto align_bytes = (8 - (uintptr_t)next) & 7; + len -= align_bytes; + align_to_8(align_bytes, crc0, next); + } + + // Now work on the remaining blocks + count = len / 24; // number of triplets + len %= 24; // bytes remaining + uint64_t n = count >> 7; // #blocks = first block + full blocks + uint64_t block_size = count & 127; + if (block_size == 0) { + block_size = 128; + } else { + n++; + } + // points to the first byte of the next block + const uint64_t* next0 = (uint64_t*)next + block_size; + const uint64_t* next1 = next0 + block_size; + const uint64_t* next2 = next1 + block_size; + + crc1 = crc2 = 0; + // Use Duff's device, a for() loop inside a switch() + // statement. 
This needs to execute at least once, round len + // down to nearest triplet multiple + switch (block_size) { + case 128: + do { + // jumps here for a full block of len 128 + CRCtriplet(crc, next, -128); + /* fallthrough */ + case 127: + // jumps here or below for the first block smaller + CRCtriplet(crc, next, -127); + /* fallthrough */ + case 126: + CRCtriplet(crc, next, -126); // than 128 + /* fallthrough */ + case 125: + CRCtriplet(crc, next, -125); + /* fallthrough */ + case 124: + CRCtriplet(crc, next, -124); + /* fallthrough */ + case 123: + CRCtriplet(crc, next, -123); + /* fallthrough */ + case 122: + CRCtriplet(crc, next, -122); + /* fallthrough */ + case 121: + CRCtriplet(crc, next, -121); + /* fallthrough */ + case 120: + CRCtriplet(crc, next, -120); + /* fallthrough */ + case 119: + CRCtriplet(crc, next, -119); + /* fallthrough */ + case 118: + CRCtriplet(crc, next, -118); + /* fallthrough */ + case 117: + CRCtriplet(crc, next, -117); + /* fallthrough */ + case 116: + CRCtriplet(crc, next, -116); + /* fallthrough */ + case 115: + CRCtriplet(crc, next, -115); + /* fallthrough */ + case 114: + CRCtriplet(crc, next, -114); + /* fallthrough */ + case 113: + CRCtriplet(crc, next, -113); + /* fallthrough */ + case 112: + CRCtriplet(crc, next, -112); + /* fallthrough */ + case 111: + CRCtriplet(crc, next, -111); + /* fallthrough */ + case 110: + CRCtriplet(crc, next, -110); + /* fallthrough */ + case 109: + CRCtriplet(crc, next, -109); + /* fallthrough */ + case 108: + CRCtriplet(crc, next, -108); + /* fallthrough */ + case 107: + CRCtriplet(crc, next, -107); + /* fallthrough */ + case 106: + CRCtriplet(crc, next, -106); + /* fallthrough */ + case 105: + CRCtriplet(crc, next, -105); + /* fallthrough */ + case 104: + CRCtriplet(crc, next, -104); + /* fallthrough */ + case 103: + CRCtriplet(crc, next, -103); + /* fallthrough */ + case 102: + CRCtriplet(crc, next, -102); + /* fallthrough */ + case 101: + CRCtriplet(crc, next, -101); + /* fallthrough */ + case 100: + CRCtriplet(crc, next, -100); + /* fallthrough */ + case 99: + CRCtriplet(crc, next, -99); + /* fallthrough */ + case 98: + CRCtriplet(crc, next, -98); + /* fallthrough */ + case 97: + CRCtriplet(crc, next, -97); + /* fallthrough */ + case 96: + CRCtriplet(crc, next, -96); + /* fallthrough */ + case 95: + CRCtriplet(crc, next, -95); + /* fallthrough */ + case 94: + CRCtriplet(crc, next, -94); + /* fallthrough */ + case 93: + CRCtriplet(crc, next, -93); + /* fallthrough */ + case 92: + CRCtriplet(crc, next, -92); + /* fallthrough */ + case 91: + CRCtriplet(crc, next, -91); + /* fallthrough */ + case 90: + CRCtriplet(crc, next, -90); + /* fallthrough */ + case 89: + CRCtriplet(crc, next, -89); + /* fallthrough */ + case 88: + CRCtriplet(crc, next, -88); + /* fallthrough */ + case 87: + CRCtriplet(crc, next, -87); + /* fallthrough */ + case 86: + CRCtriplet(crc, next, -86); + /* fallthrough */ + case 85: + CRCtriplet(crc, next, -85); + /* fallthrough */ + case 84: + CRCtriplet(crc, next, -84); + /* fallthrough */ + case 83: + CRCtriplet(crc, next, -83); + /* fallthrough */ + case 82: + CRCtriplet(crc, next, -82); + /* fallthrough */ + case 81: + CRCtriplet(crc, next, -81); + /* fallthrough */ + case 80: + CRCtriplet(crc, next, -80); + /* fallthrough */ + case 79: + CRCtriplet(crc, next, -79); + /* fallthrough */ + case 78: + CRCtriplet(crc, next, -78); + /* fallthrough */ + case 77: + CRCtriplet(crc, next, -77); + /* fallthrough */ + case 76: + CRCtriplet(crc, next, -76); + /* fallthrough */ + case 75: + CRCtriplet(crc, next, 
-75); + /* fallthrough */ + case 74: + CRCtriplet(crc, next, -74); + /* fallthrough */ + case 73: + CRCtriplet(crc, next, -73); + /* fallthrough */ + case 72: + CRCtriplet(crc, next, -72); + /* fallthrough */ + case 71: + CRCtriplet(crc, next, -71); + /* fallthrough */ + case 70: + CRCtriplet(crc, next, -70); + /* fallthrough */ + case 69: + CRCtriplet(crc, next, -69); + /* fallthrough */ + case 68: + CRCtriplet(crc, next, -68); + /* fallthrough */ + case 67: + CRCtriplet(crc, next, -67); + /* fallthrough */ + case 66: + CRCtriplet(crc, next, -66); + /* fallthrough */ + case 65: + CRCtriplet(crc, next, -65); + /* fallthrough */ + case 64: + CRCtriplet(crc, next, -64); + /* fallthrough */ + case 63: + CRCtriplet(crc, next, -63); + /* fallthrough */ + case 62: + CRCtriplet(crc, next, -62); + /* fallthrough */ + case 61: + CRCtriplet(crc, next, -61); + /* fallthrough */ + case 60: + CRCtriplet(crc, next, -60); + /* fallthrough */ + case 59: + CRCtriplet(crc, next, -59); + /* fallthrough */ + case 58: + CRCtriplet(crc, next, -58); + /* fallthrough */ + case 57: + CRCtriplet(crc, next, -57); + /* fallthrough */ + case 56: + CRCtriplet(crc, next, -56); + /* fallthrough */ + case 55: + CRCtriplet(crc, next, -55); + /* fallthrough */ + case 54: + CRCtriplet(crc, next, -54); + /* fallthrough */ + case 53: + CRCtriplet(crc, next, -53); + /* fallthrough */ + case 52: + CRCtriplet(crc, next, -52); + /* fallthrough */ + case 51: + CRCtriplet(crc, next, -51); + /* fallthrough */ + case 50: + CRCtriplet(crc, next, -50); + /* fallthrough */ + case 49: + CRCtriplet(crc, next, -49); + /* fallthrough */ + case 48: + CRCtriplet(crc, next, -48); + /* fallthrough */ + case 47: + CRCtriplet(crc, next, -47); + /* fallthrough */ + case 46: + CRCtriplet(crc, next, -46); + /* fallthrough */ + case 45: + CRCtriplet(crc, next, -45); + /* fallthrough */ + case 44: + CRCtriplet(crc, next, -44); + /* fallthrough */ + case 43: + CRCtriplet(crc, next, -43); + /* fallthrough */ + case 42: + CRCtriplet(crc, next, -42); + /* fallthrough */ + case 41: + CRCtriplet(crc, next, -41); + /* fallthrough */ + case 40: + CRCtriplet(crc, next, -40); + /* fallthrough */ + case 39: + CRCtriplet(crc, next, -39); + /* fallthrough */ + case 38: + CRCtriplet(crc, next, -38); + /* fallthrough */ + case 37: + CRCtriplet(crc, next, -37); + /* fallthrough */ + case 36: + CRCtriplet(crc, next, -36); + /* fallthrough */ + case 35: + CRCtriplet(crc, next, -35); + /* fallthrough */ + case 34: + CRCtriplet(crc, next, -34); + /* fallthrough */ + case 33: + CRCtriplet(crc, next, -33); + /* fallthrough */ + case 32: + CRCtriplet(crc, next, -32); + /* fallthrough */ + case 31: + CRCtriplet(crc, next, -31); + /* fallthrough */ + case 30: + CRCtriplet(crc, next, -30); + /* fallthrough */ + case 29: + CRCtriplet(crc, next, -29); + /* fallthrough */ + case 28: + CRCtriplet(crc, next, -28); + /* fallthrough */ + case 27: + CRCtriplet(crc, next, -27); + /* fallthrough */ + case 26: + CRCtriplet(crc, next, -26); + /* fallthrough */ + case 25: + CRCtriplet(crc, next, -25); + /* fallthrough */ + case 24: + CRCtriplet(crc, next, -24); + /* fallthrough */ + case 23: + CRCtriplet(crc, next, -23); + /* fallthrough */ + case 22: + CRCtriplet(crc, next, -22); + /* fallthrough */ + case 21: + CRCtriplet(crc, next, -21); + /* fallthrough */ + case 20: + CRCtriplet(crc, next, -20); + /* fallthrough */ + case 19: + CRCtriplet(crc, next, -19); + /* fallthrough */ + case 18: + CRCtriplet(crc, next, -18); + /* fallthrough */ + case 17: + CRCtriplet(crc, next, -17); + /* 
fallthrough */ + case 16: + CRCtriplet(crc, next, -16); + /* fallthrough */ + case 15: + CRCtriplet(crc, next, -15); + /* fallthrough */ + case 14: + CRCtriplet(crc, next, -14); + /* fallthrough */ + case 13: + CRCtriplet(crc, next, -13); + /* fallthrough */ + case 12: + CRCtriplet(crc, next, -12); + /* fallthrough */ + case 11: + CRCtriplet(crc, next, -11); + /* fallthrough */ + case 10: + CRCtriplet(crc, next, -10); + /* fallthrough */ + case 9: + CRCtriplet(crc, next, -9); + /* fallthrough */ + case 8: + CRCtriplet(crc, next, -8); + /* fallthrough */ + case 7: + CRCtriplet(crc, next, -7); + /* fallthrough */ + case 6: + CRCtriplet(crc, next, -6); + /* fallthrough */ + case 5: + CRCtriplet(crc, next, -5); + /* fallthrough */ + case 4: + CRCtriplet(crc, next, -4); + /* fallthrough */ + case 3: + CRCtriplet(crc, next, -3); + /* fallthrough */ + case 2: + CRCtriplet(crc, next, -2); + /* fallthrough */ + case 1: + CRCduplet(crc, next, -1); // the final triplet is actually only 2 + //{ CombineCRC(); } + crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2); + if (--n > 0) { + crc1 = crc2 = 0; + block_size = 128; + // points to the first byte of the next block + next0 = next2 + 128; + next1 = next0 + 128; // from here on all blocks are 128 long + next2 = next1 + 128; + } + /* fallthrough */ + case 0:; + } while (n > 0); + } + next = (const unsigned char*)next2; + } + uint64_t count2 = len >> 3; // 216 of less bytes is 27 or less singlets + len = len & 7; + next += (count2 * 8); + switch (count2) { + case 27: + CRCsinglet(crc0, next, -27 * 8); + /* fallthrough */ + case 26: + CRCsinglet(crc0, next, -26 * 8); + /* fallthrough */ + case 25: + CRCsinglet(crc0, next, -25 * 8); + /* fallthrough */ + case 24: + CRCsinglet(crc0, next, -24 * 8); + /* fallthrough */ + case 23: + CRCsinglet(crc0, next, -23 * 8); + /* fallthrough */ + case 22: + CRCsinglet(crc0, next, -22 * 8); + /* fallthrough */ + case 21: + CRCsinglet(crc0, next, -21 * 8); + /* fallthrough */ + case 20: + CRCsinglet(crc0, next, -20 * 8); + /* fallthrough */ + case 19: + CRCsinglet(crc0, next, -19 * 8); + /* fallthrough */ + case 18: + CRCsinglet(crc0, next, -18 * 8); + /* fallthrough */ + case 17: + CRCsinglet(crc0, next, -17 * 8); + /* fallthrough */ + case 16: + CRCsinglet(crc0, next, -16 * 8); + /* fallthrough */ + case 15: + CRCsinglet(crc0, next, -15 * 8); + /* fallthrough */ + case 14: + CRCsinglet(crc0, next, -14 * 8); + /* fallthrough */ + case 13: + CRCsinglet(crc0, next, -13 * 8); + /* fallthrough */ + case 12: + CRCsinglet(crc0, next, -12 * 8); + /* fallthrough */ + case 11: + CRCsinglet(crc0, next, -11 * 8); + /* fallthrough */ + case 10: + CRCsinglet(crc0, next, -10 * 8); + /* fallthrough */ + case 9: + CRCsinglet(crc0, next, -9 * 8); + /* fallthrough */ + case 8: + CRCsinglet(crc0, next, -8 * 8); + /* fallthrough */ + case 7: + CRCsinglet(crc0, next, -7 * 8); + /* fallthrough */ + case 6: + CRCsinglet(crc0, next, -6 * 8); + /* fallthrough */ + case 5: + CRCsinglet(crc0, next, -5 * 8); + /* fallthrough */ + case 4: + CRCsinglet(crc0, next, -4 * 8); + /* fallthrough */ + case 3: + CRCsinglet(crc0, next, -3 * 8); + /* fallthrough */ + case 2: + CRCsinglet(crc0, next, -2 * 8); + /* fallthrough */ + case 1: + CRCsinglet(crc0, next, -1 * 8); + /* fallthrough */ + case 0:; + } + } + { + align_to_8(len, crc0, next); + return (uint32_t)crc0 ^ 0xffffffffu; + } +} + +#else +#define NO_THREEWAY_CRC32C +#endif //HAVE_SSE42 && HAVE_PCLMUL + +static inline Function Choose_Extend() { +#ifdef HAVE_POWER8 + return isAltiVec() ? 
ExtendPPCImpl : ExtendImpl<Slow_CRC32>; +#elif defined(HAVE_ARMV8_CRC) + if(crc32c_aarch64_available()) { + return ExtendARMImpl; + } else { + return ExtendImpl<Slow_CRC32>; + } +#else + if (isSSE42()) { + if (isPCLMULQDQ()) { +#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C + return crc32c_3way; +#else + return ExtendImpl<Fast_CRC32>; // Fast_CRC32 will check HAVE_SSE42 itself +#endif + } + else { // no runtime PCLMULQDQ support but has SSE42 support + return ExtendImpl<Fast_CRC32>; + } + } // end of isSSE42() + else { + return ExtendImpl<Slow_CRC32>; + } +#endif +} + +static const Function ChosenExtend = Choose_Extend(); + +static inline uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + return ChosenExtend(crc, buf, size); +} +} // namespace crc32c +} // namespace mysys_namespace + +extern "C" unsigned int my_crc32c(unsigned int crc, const char *buf, size_t size) +{ + return mysys_namespace::crc32c::Extend(crc,buf, size); +} diff --git a/mysys/crc32/crc32c_ppc.c b/mysys/crc32/crc32c_ppc.c new file mode 100644 index 00000000000..72f24283454 --- /dev/null +++ b/mysys/crc32/crc32c_ppc.c @@ -0,0 +1,5 @@ +#define CRC32_FUNCTION crc32c_ppc +#define CRC_TABLE +#define POWER8_INTRINSICS +#include "pcc_crc32c_constants.h" +#include "crc_ppc64.h" diff --git a/mysys/crc32/crc32c_ppc.h b/mysys/crc32/crc32c_ppc.h new file mode 100644 index 00000000000..c359061c610 --- /dev/null +++ b/mysys/crc32/crc32c_ppc.h @@ -0,0 +1,19 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, + unsigned len); + +#ifdef __cplusplus +} +#endif diff --git a/mysys/crc32/crc_ppc64.h b/mysys/crc32/crc_ppc64.h new file mode 100644 index 00000000000..eb9379abc6c --- /dev/null +++ b/mysys/crc32/crc_ppc64.h @@ -0,0 +1,664 @@ +/* + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. + * + * http://en.wikipedia.org/wiki/Barrett_reduction + * + * This code uses gcc vector builtins instead using assembly directly. 
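The header comment above outlines the strategy: fold the input in blocks of up to 32 kB using eight parallel vpmsum streams, reduce the resulting 1024 bits to 64 bits, then finish with a Barrett reduction over GF(2). As a rough scalar model of that final non-reflected Barrett step (illustrative only, not code added by this patch), with clmul64() standing in for the vpmsumd instruction and mu/poly standing in for the two v_Barrett_const entries:

#include <stdint.h>

/* Carry-less (GF(2)) 64x64 -> 128-bit multiply; a software stand-in for
   vpmsumd.  Requires a compiler with unsigned __int128 (gcc/clang). */
static unsigned __int128 clmul64(uint64_t a, uint64_t b)
{
  unsigned __int128 r= 0;
  for (int i= 0; i < 64; i++)
    if (b & (1ULL << i))
      r^= (unsigned __int128) a << i;
  return r;
}

/* Barrett step as described in the comments further down in this file:
   ma = a * mu, q = floor(ma / 2^64), then a - q*n, where subtraction is xor. */
static uint32_t barrett32(uint64_t a, uint64_t mu, uint64_t poly)
{
  uint64_t q= (uint64_t) (clmul64(a, mu) >> 64);
  return (uint32_t) (a ^ (uint64_t) clmul64(q, poly));
}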
+ * + * Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of either: + * + * a) the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) + * any later version, or + * b) the Apache License, Version 2.0 + */ + +#include <altivec.h> + + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#ifdef REFLECT +static unsigned int crc32_align(unsigned int crc, const unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + return crc; +} +#else +static unsigned int crc32_align(unsigned int crc, const unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); + return crc; +} +#endif + +static unsigned int __attribute__ ((aligned (32))) +__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); + + +unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, + unsigned long len) +{ + unsigned int prealign; + unsigned int tail; + +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + if (len < VMX_ALIGN + VMX_ALIGN_MASK) { + crc = crc32_align(crc, p, len); + goto out; + } + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc32_align(crc, p, prealign); + len -= prealign; + p += prealign; + } + + crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc32_align(crc, p, tail); + } + +out: +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + return crc; +} + +#if defined (__clang__) +#include "clang_workaround.h" +#else +#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b)) +#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0) +#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1) +#endif + +/* When we have a load-store in a single-dispatch group and address overlap + * such that foward is not allowed (load-hit-store) the group must be flushed. + * A group ending NOP prevents the flush. + */ +#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory") + +#if defined(__BIG_ENDIAN__) && defined (REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#endif + +#ifdef BYTESWAP_DATA +#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\ + (__vector unsigned char) vc) +#if defined(__LITTLE_ENDIAN__) +/* Byte reverse permute constant LE. 
*/ +static const __vector unsigned long long vperm_const + __attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL, + 0x0001020304050607UL }; +#else +static const __vector unsigned long long vperm_const + __attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL, + 0X0706050403020100UL }; +#endif +#else +#define VEC_PERM(vr, va, vb, vc) +#endif + +static unsigned int __attribute__ ((aligned (32))) +__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { + + const __vector unsigned long long vzero = {0,0}; + const __vector unsigned long long vones = {0xffffffffffffffffUL, + 0xffffffffffffffffUL}; + +#ifdef REFLECT + __vector unsigned char vsht_splat; + const __vector unsigned long long vmask_32bit = + (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, + (__vector unsigned char)vones, 4); +#endif + + const __vector unsigned long long vmask_64bit = + (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, + (__vector unsigned char)vones, 8); + + __vector unsigned long long vcrc; + + __vector unsigned long long vconst1, vconst2; + + /* vdata0-vdata7 will contain our data (p). */ + __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, + vdata5, vdata6, vdata7; + + /* v0-v7 will contain our checksums */ + __vector unsigned long long v0 = {0,0}; + __vector unsigned long long v1 = {0,0}; + __vector unsigned long long v2 = {0,0}; + __vector unsigned long long v3 = {0,0}; + __vector unsigned long long v4 = {0,0}; + __vector unsigned long long v5 = {0,0}; + __vector unsigned long long v6 = {0,0}; + __vector unsigned long long v7 = {0,0}; + + + /* Vector auxiliary variables. */ + __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; + + unsigned int result = 0; + unsigned int offset; /* Constant table offset. */ + + unsigned long i; /* Counter. */ + unsigned long chunks; + + unsigned long block_size; + int next_block = 0; + + /* Align by 128 bits. The last 128 bit block will be processed at end. */ + unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; + +#ifdef REFLECT + vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc); +#else + vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL); + + /* Shift into top 32 bits */ + vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc, + (__vector unsigned char)vzero, 4); +#endif + + /* Short version. */ + if (len < 256) { + /* Calculate where in the constant table we need to start. */ + offset = 256 - len; + + vconst1 = vec_ld(offset, vcrc_short_const); + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vconst1, vperm_const); + + /* xor initial value*/ + vdata0 = vec_xor(vdata0, vcrc); + + vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw + ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); + v0 = vec_xor(v0, vdata0); + + for (i = 16; i < len; i += 16) { + vconst1 = vec_ld(offset + i, vcrc_short_const); + vdata0 = vec_ld(i, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vconst1, vperm_const); + vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw + ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); + v0 = vec_xor(v0, vdata0); + } + } else { + + /* Load initial values. 
*/ + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + vdata1 = vec_ld(16, (__vector unsigned long long*) p); + + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + VEC_PERM(vdata1, vdata1, vdata1, vperm_const); + + vdata2 = vec_ld(32, (__vector unsigned long long*) p); + vdata3 = vec_ld(48, (__vector unsigned long long*) p); + + VEC_PERM(vdata2, vdata2, vdata2, vperm_const); + VEC_PERM(vdata3, vdata3, vdata3, vperm_const); + + vdata4 = vec_ld(64, (__vector unsigned long long*) p); + vdata5 = vec_ld(80, (__vector unsigned long long*) p); + + VEC_PERM(vdata4, vdata4, vdata4, vperm_const); + VEC_PERM(vdata5, vdata5, vdata5, vperm_const); + + vdata6 = vec_ld(96, (__vector unsigned long long*) p); + vdata7 = vec_ld(112, (__vector unsigned long long*) p); + + VEC_PERM(vdata6, vdata6, vdata6, vperm_const); + VEC_PERM(vdata7, vdata7, vdata7, vperm_const); + + /* xor in initial value */ + vdata0 = vec_xor(vdata0, vcrc); + + p = (char *)p + 128; + + do { + /* Checksum in blocks of MAX_SIZE. */ + block_size = length; + if (block_size > MAX_SIZE) { + block_size = MAX_SIZE; + } + + length = length - block_size; + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + offset = (MAX_SIZE/8) - (block_size/8); + /* We reduce our final 128 bytes in a separate step */ + chunks = (block_size/128)-1; + + vconst1 = vec_ld(offset, vcrc_const); + + va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, + (__vector unsigned long long)vconst1); + va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, + (__vector unsigned long long)vconst1); + va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, + (__vector unsigned long long)vconst1); + va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, + (__vector unsigned long long)vconst1); + va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, + (__vector unsigned long long)vconst1); + va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, + (__vector unsigned long long)vconst1); + va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, + (__vector unsigned long long)vconst1); + va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, + (__vector unsigned long long)vconst1); + + if (chunks > 1) { + offset += 16; + vconst2 = vec_ld(offset, vcrc_const); + GROUP_ENDING_NOP; + + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + + vdata1 = vec_ld(16, (__vector unsigned long long*) p); + VEC_PERM(vdata1, vdata1, vdata1, vperm_const); + + vdata2 = vec_ld(32, (__vector unsigned long long*) p); + VEC_PERM(vdata2, vdata2, vdata2, vperm_const); + + vdata3 = vec_ld(48, (__vector unsigned long long*) p); + VEC_PERM(vdata3, vdata3, vdata3, vperm_const); + + vdata4 = vec_ld(64, (__vector unsigned long long*) p); + VEC_PERM(vdata4, vdata4, vdata4, vperm_const); + + vdata5 = vec_ld(80, (__vector unsigned long long*) p); + VEC_PERM(vdata5, vdata5, vdata5, vperm_const); + + vdata6 = vec_ld(96, (__vector unsigned long long*) p); + VEC_PERM(vdata6, vdata6, vdata6, vperm_const); + + vdata7 = vec_ld(112, (__vector unsigned long long*) p); + VEC_PERM(vdata7, vdata7, vdata7, vperm_const); + + p = (char *)p + 128; + + /* + * main loop. We modulo schedule it such that it takes three + * iterations to complete - first iteration load, second + * iteration vpmsum, third iteration xor. 
+ */ + for (i = 0; i < chunks-2; i++) { + vconst1 = vec_ld(offset, vcrc_const); + offset += 16; + GROUP_ENDING_NOP; + + v0 = vec_xor(v0, va0); + va0 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata0, (__vector unsigned long long)vconst2); + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + GROUP_ENDING_NOP; + + v1 = vec_xor(v1, va1); + va1 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata1, (__vector unsigned long long)vconst2); + vdata1 = vec_ld(16, (__vector unsigned long long*) p); + VEC_PERM(vdata1, vdata1, vdata1, vperm_const); + GROUP_ENDING_NOP; + + v2 = vec_xor(v2, va2); + va2 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata2, (__vector unsigned long long)vconst2); + vdata2 = vec_ld(32, (__vector unsigned long long*) p); + VEC_PERM(vdata2, vdata2, vdata2, vperm_const); + GROUP_ENDING_NOP; + + v3 = vec_xor(v3, va3); + va3 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata3, (__vector unsigned long long)vconst2); + vdata3 = vec_ld(48, (__vector unsigned long long*) p); + VEC_PERM(vdata3, vdata3, vdata3, vperm_const); + + vconst2 = vec_ld(offset, vcrc_const); + GROUP_ENDING_NOP; + + v4 = vec_xor(v4, va4); + va4 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata4, (__vector unsigned long long)vconst1); + vdata4 = vec_ld(64, (__vector unsigned long long*) p); + VEC_PERM(vdata4, vdata4, vdata4, vperm_const); + GROUP_ENDING_NOP; + + v5 = vec_xor(v5, va5); + va5 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata5, (__vector unsigned long long)vconst1); + vdata5 = vec_ld(80, (__vector unsigned long long*) p); + VEC_PERM(vdata5, vdata5, vdata5, vperm_const); + GROUP_ENDING_NOP; + + v6 = vec_xor(v6, va6); + va6 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata6, (__vector unsigned long long)vconst1); + vdata6 = vec_ld(96, (__vector unsigned long long*) p); + VEC_PERM(vdata6, vdata6, vdata6, vperm_const); + GROUP_ENDING_NOP; + + v7 = vec_xor(v7, va7); + va7 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata7, (__vector unsigned long long)vconst1); + vdata7 = vec_ld(112, (__vector unsigned long long*) p); + VEC_PERM(vdata7, vdata7, vdata7, vperm_const); + + p = (char *)p + 128; + } + + /* First cool down*/ + vconst1 = vec_ld(offset, vcrc_const); + offset += 16; + + v0 = vec_xor(v0, va0); + va0 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata0, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v1 = vec_xor(v1, va1); + va1 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata1, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v2 = vec_xor(v2, va2); + va2 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata2, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v3 = vec_xor(v3, va3); + va3 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata3, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v4 = vec_xor(v4, va4); + va4 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata4, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v5 = vec_xor(v5, va5); + va5 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata5, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v6 = vec_xor(v6, va6); + va6 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata6, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v7 = vec_xor(v7, va7); + va7 = __builtin_crypto_vpmsumd ((__vector 
unsigned long + long)vdata7, (__vector unsigned long long)vconst1); + }/* else */ + + /* Second cool down. */ + v0 = vec_xor(v0, va0); + v1 = vec_xor(v1, va1); + v2 = vec_xor(v2, va2); + v3 = vec_xor(v3, va3); + v4 = vec_xor(v4, va4); + v5 = vec_xor(v5, va5); + v6 = vec_xor(v6, va6); + v7 = vec_xor(v7, va7); + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. + */ + v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, + (__vector unsigned char)vzero, 4); + v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, + (__vector unsigned char)vzero, 4); + v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, + (__vector unsigned char)vzero, 4); + v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, + (__vector unsigned char)vzero, 4); + v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, + (__vector unsigned char)vzero, 4); + v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, + (__vector unsigned char)vzero, 4); + v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, + (__vector unsigned char)vzero, 4); + v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, + (__vector unsigned char)vzero, 4); +#endif + + /* xor with the last 1024 bits. */ + va0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(va0, va0, va0, vperm_const); + + va1 = vec_ld(16, (__vector unsigned long long*) p); + VEC_PERM(va1, va1, va1, vperm_const); + + va2 = vec_ld(32, (__vector unsigned long long*) p); + VEC_PERM(va2, va2, va2, vperm_const); + + va3 = vec_ld(48, (__vector unsigned long long*) p); + VEC_PERM(va3, va3, va3, vperm_const); + + va4 = vec_ld(64, (__vector unsigned long long*) p); + VEC_PERM(va4, va4, va4, vperm_const); + + va5 = vec_ld(80, (__vector unsigned long long*) p); + VEC_PERM(va5, va5, va5, vperm_const); + + va6 = vec_ld(96, (__vector unsigned long long*) p); + VEC_PERM(va6, va6, va6, vperm_const); + + va7 = vec_ld(112, (__vector unsigned long long*) p); + VEC_PERM(va7, va7, va7, vperm_const); + + p = (char *)p + 128; + + vdata0 = vec_xor(v0, va0); + vdata1 = vec_xor(v1, va1); + vdata2 = vec_xor(v2, va2); + vdata3 = vec_xor(v3, va3); + vdata4 = vec_xor(v4, va4); + vdata5 = vec_xor(v5, va5); + vdata6 = vec_xor(v6, va6); + vdata7 = vec_xor(v7, va7); + + /* Check if we have more blocks to process */ + next_block = 0; + if (length != 0) { + next_block = 1; + + /* zero v0-v7 */ + v0 = vec_xor(v0, v0); + v1 = vec_xor(v1, v1); + v2 = vec_xor(v2, v2); + v3 = vec_xor(v3, v3); + v4 = vec_xor(v4, v4); + v5 = vec_xor(v5, v5); + v6 = vec_xor(v6, v6); + v7 = vec_xor(v7, v7); + } + length = length + 128; + + } while (next_block); + + /* Calculate how many bytes we have left. */ + length = (len & 127); + + /* Calculate where in (short) constant table we need to start. 
*/ + offset = 128 - length; + + v0 = vec_ld(offset, vcrc_short_const); + v1 = vec_ld(offset + 16, vcrc_short_const); + v2 = vec_ld(offset + 32, vcrc_short_const); + v3 = vec_ld(offset + 48, vcrc_short_const); + v4 = vec_ld(offset + 64, vcrc_short_const); + v5 = vec_ld(offset + 80, vcrc_short_const); + v6 = vec_ld(offset + 96, vcrc_short_const); + v7 = vec_ld(offset + 112, vcrc_short_const); + + offset += 128; + + v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata0,(__vector unsigned int)v0); + v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata1,(__vector unsigned int)v1); + v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata2,(__vector unsigned int)v2); + v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata3,(__vector unsigned int)v3); + v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata4,(__vector unsigned int)v4); + v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata5,(__vector unsigned int)v5); + v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata6,(__vector unsigned int)v6); + v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata7,(__vector unsigned int)v7); + + /* Now reduce the tail (0-112 bytes). */ + for (i = 0; i < length; i+=16) { + vdata0 = vec_ld(i,(__vector unsigned long long*)p); + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + va0 = vec_ld(offset + i,vcrc_short_const); + va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata0,(__vector unsigned int)va0); + v0 = vec_xor(v0, va0); + } + + /* xor all parallel chunks together. */ + v0 = vec_xor(v0, v1); + v2 = vec_xor(v2, v3); + v4 = vec_xor(v4, v5); + v6 = vec_xor(v6, v7); + + v0 = vec_xor(v0, v2); + v4 = vec_xor(v4, v6); + + v0 = vec_xor(v0, v4); + } + + /* Barrett Reduction */ + vconst1 = vec_ld(0, v_Barrett_const); + vconst2 = vec_ld(16, v_Barrett_const); + + v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, + (__vector unsigned char)v0, 8); + v0 = vec_xor(v1,v0); + +#ifdef REFLECT + /* shift left one bit */ + vsht_splat = vec_splat_u8 (1); + v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0, + vsht_splat); +#endif + + v0 = vec_and(v0, vmask_64bit); + +#ifndef REFLECT + + /* + * Now for the actual algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + + /* ma */ + v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0, + (__vector unsigned long long)vconst1); + /* q = floor(ma/(2^64)) */ + v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero, + (__vector unsigned char)v1, 8); + /* qn */ + v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, + (__vector unsigned long long)vconst2); + /* a - qn, subtraction is xor in GF(2) */ + v0 = vec_xor (v0, v1); + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + result = __builtin_unpack_vector_1 (v0); +#else + + /* + * The reflected version of Barrett reduction. 
Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + + /* bottom 32 bits of a */ + v1 = vec_and(v0, vmask_32bit); + + /* ma */ + v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, + (__vector unsigned long long)vconst1); + + /* bottom 32bits of ma */ + v1 = vec_and(v1, vmask_32bit); + /* qn */ + v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, + (__vector unsigned long long)vconst2); + /* a - qn, subtraction is xor in GF(2) */ + v0 = vec_xor (v0, v1); + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + + /* shift result into top 64 bits of */ + v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, + (__vector unsigned char)vzero, 4); + + result = __builtin_unpack_vector_0 (v0); +#endif + + return result; +} diff --git a/mysys/checksum.c b/mysys/crc32ieee.cc index 948b9be6164..5f8344b4f9d 100644 --- a/mysys/checksum.c +++ b/mysys/crc32ieee.cc @@ -18,41 +18,46 @@ #include <my_sys.h> #include <zlib.h> -#if !defined(HAVE_CRC32_VPMSUM) /* TODO: remove this once zlib adds inherent support for hardware accelerated crc32 for all architectures. */ static unsigned int my_crc32_zlib(unsigned int crc, const void *data, size_t len) { - return (unsigned int) crc32(crc, data, (unsigned int) len); + return (unsigned int) crc32(crc, (const Bytef *)data, (unsigned int) len); } -my_crc32_t my_checksum= my_crc32_zlib; +#ifdef HAVE_PCLMUL +extern "C" int crc32_pclmul_enabled(); +extern "C" unsigned int crc32_pclmul(unsigned int, const void *, size_t); +#elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC) +extern "C" int crc32_aarch64_available(); +extern "C" unsigned int crc32_aarch64(unsigned int, const void *, size_t); #endif -#if __GNUC__ >= 4 && defined(__x86_64__) -extern int crc32_pclmul_enabled(); -extern unsigned int crc32_pclmul(unsigned int, const void *, size_t); +typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t); -/*----------------------------- x86_64 ---------------------------------*/ -void my_checksum_init(void) +static my_crc32_t init_crc32() { + my_crc32_t func= my_crc32_zlib; +#ifdef HAVE_PCLMUL if (crc32_pclmul_enabled()) - my_checksum= crc32_pclmul; -} + func = crc32_pclmul; #elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC) -/*----------------------------- aarch64 --------------------------------*/ + if (crc32_aarch64_available()) + func= crc32_aarch64; +#endif + return func; +} -extern unsigned int crc32_aarch64(unsigned int, const void *, size_t); +static const my_crc32_t my_checksum_func= init_crc32(); -/* Ideally all ARM 64 bit processor should support crc32 but if some model -doesn't support better to find it out through auxillary vector. 
*/
-void my_checksum_init(void)
+#ifndef __powerpc64__
+/* For powerpc, my_checksum is defined elsewhere. */
+extern "C" unsigned int my_checksum(unsigned int crc, const void *data, size_t len)
 {
- if (crc32_aarch64_available())
- my_checksum= crc32_aarch64;
+ return my_checksum_func(crc, data, len);
 }
-#else
-void my_checksum_init(void) {}
 #endif
+
+
diff --git a/mysys/mf_iocache.c b/mysys/mf_iocache.c
index 2e34cef5d19..75ff99b40a5 100644
--- a/mysys/mf_iocache.c
+++ b/mysys/mf_iocache.c
@@ -1,6 +1,6 @@
 /* Copyright (c) 2000, 2011, Oracle and/or its affiliates
- Copyright (c) 2010, 2015, MariaDB
+ Copyright (c) 2010, 2020, MariaDB
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -769,7 +769,8 @@ int _my_b_cache_read(IO_CACHE *info, uchar *Buffer, size_t Count)
 info->read_pos=info->buffer+Count;
 info->read_end=info->buffer+length;
 info->pos_in_file=pos_in_file;
- memcpy(Buffer, info->buffer, Count);
+ if (Count)
+ memcpy(Buffer, info->buffer, Count);
 DBUG_RETURN(0);
 }
@@ -1270,7 +1271,8 @@ static int _my_b_cache_read_r(IO_CACHE *cache, uchar *Buffer, size_t Count)
 DBUG_RETURN(1);
 }
 cnt= (len > Count) ? Count : len;
- memcpy(Buffer, cache->read_pos, cnt);
+ if (cnt)
+ memcpy(Buffer, cache->read_pos, cnt);
 Count -= cnt;
 Buffer+= cnt;
 left_length+= cnt;
diff --git a/mysys/my_alloc.c b/mysys/my_alloc.c
index b4a63e93be3..d7e62726b22 100644
--- a/mysys/my_alloc.c
+++ b/mysys/my_alloc.c
@@ -1,5 +1,6 @@
 /* Copyright (c) 2000, 2010, Oracle and/or its affiliates
+ Copyright (c) 2010, 2020, MariaDB
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -465,7 +466,8 @@ char *strmake_root(MEM_ROOT *root, const char *str, size_t len)
 char *pos;
 if ((pos=alloc_root(root,len+1)))
 {
- memcpy(pos,str,len);
+ if (len)
+ memcpy(pos,str,len);
 pos[len]=0;
 }
 return pos;
diff --git a/mysys/my_init.c b/mysys/my_init.c
index cd9875017f0..2b420da03be 100644
--- a/mysys/my_init.c
+++ b/mysys/my_init.c
@@ -100,9 +100,6 @@ my_bool my_init(void)
 /* Initialize our mutex handling */
 my_mutex_init();
- /* Initialize CPU architecture specific hardware based crc32 optimization */
- my_checksum_init();
-
 if (my_thread_global_init())
 return 1;
diff --git a/mysys/my_rename.c b/mysys/my_rename.c
index 9f0770e8140..7b31e83be20 100644
--- a/mysys/my_rename.c
+++ b/mysys/my_rename.c
@@ -19,8 +19,62 @@
 #include "m_string.h"
 #undef my_rename
- /* On unix rename deletes to file if it exists */
+#ifdef _WIN32
+
+#define RENAME_MAX_RETRIES 50
+
+/*
+ On Windows, badly behaved third-party programs (backup or antivirus tools,
+ or something else) can hold a file open with a sharing mode that is
+ incompatible with renaming, i.e. they do not use FILE_SHARE_DELETE when
+ opening the file.
+
+ The following function retries the rename (up to RENAME_MAX_RETRIES times)
+ if MoveFileEx returns ERROR_SHARING_VIOLATION.
+*/
+static BOOL win_rename_with_retries(const char *from, const char *to)
+{
+#ifndef DBUG_OFF
+ FILE *fp = NULL;
+ DBUG_EXECUTE_IF("rename_sharing_violation",
+ {
+ fp= fopen(from, "r");
+ DBUG_ASSERT(fp);
+ }
+ );
+#endif
+
+ for (int retry= RENAME_MAX_RETRIES; retry--;)
+ {
+ DWORD ret = MoveFileEx(from, to,
+ MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING);
+
+ DBUG_ASSERT(fp == NULL || (ret == FALSE && GetLastError() == ERROR_SHARING_VIOLATION));
+
+ if (!ret && (GetLastError() == ERROR_SHARING_VIOLATION))
+ {
+#ifndef DBUG_OFF
+ /*
+ If the error was injected via DBUG_EXECUTE_IF, close the file that is
+ causing ERROR_SHARING_VIOLATION, so that the retry succeeds.
+ */
+ if (fp)
+ {
+ fclose(fp);
+ fp= NULL;
+ }
+#endif
+
+ Sleep(10);
+ }
+ else
+ return ret;
+ }
+ return FALSE;
+}
+#endif
+
+
+ /* On Unix, rename() deletes the target ("to") file if it exists */
 int my_rename(const char *from, const char *to, myf MyFlags)
 {
 int error = 0;
@@ -28,8 +82,7 @@ int my_rename(const char *from, const char *to, myf MyFlags)
 DBUG_PRINT("my",("from %s to %s MyFlags %lu", from, to, MyFlags));
 #if defined(__WIN__)
- if (!MoveFileEx(from, to, MOVEFILE_COPY_ALLOWED |
- MOVEFILE_REPLACE_EXISTING))
+ if (!win_rename_with_retries(from, to))
 {
 my_osmaperr(GetLastError());
#elif defined(HAVE_RENAME)
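
Note on the crc32_ppc64.c hunk above: its comments describe Barrett reduction, i.e. computing q, the multiple of the CRC polynomial P to subtract, by carry-less multiplying with mu = floor(x^64/P) and dropping the low 64 bits, then forming the remainder as a xor q*P (the two constants loaded from v_Barrett_const play the roles of mu and P). The stand-alone C sketch below models the non-reflected variant of that step with a software carry-less multiply; the names, the test value and the software clmul are illustrative only, while the vector code does the same arithmetic with __builtin_crypto_vpmsumd on precomputed constants.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* carry-less 64x64 -> 128-bit multiply over GF(2) */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
  uint64_t h= 0, l= 0;
  for (int i= 0; i < 64; i++)
    if ((b >> i) & 1)
    {
      l^= a << i;
      if (i)
        h^= a >> (64 - i);
    }
  *hi= h;
  *lo= l;
}

/* mu = floor(x^64 / P), computed by polynomial long division */
static uint64_t barrett_mu(uint64_t p)
{
  uint64_t r= 0, q= 0;
  for (int i= 64; i >= 0; i--)
  {
    r= (r << 1) | (i == 64);        /* dividend x^64: only bit 64 is set */
    if (r & (1ULL << 32))
    {
      r^= p;                        /* cancel the degree-32 term */
      q|= 1ULL << i;
    }
  }
  return q;
}

/* reference: reduce a 64-bit polynomial mod P by bitwise long division */
static uint32_t poly_mod(uint64_t a, uint64_t p)
{
  for (int i= 63; i >= 32; i--)
    if (a & (1ULL << i))
      a^= p << (i - 32);
  return (uint32_t) a;
}

/* Barrett: q is the multiple of P to subtract; subtraction is xor in GF(2) */
static uint32_t barrett_mod(uint64_t a, uint64_t p, uint64_t mu)
{
  uint64_t hi, lo, q;
  clmul64(a, mu, &hi, &lo);         /* a * mu                          */
  q= hi;                            /* q = floor(a * mu / x^64)        */
  clmul64(q, p, &hi, &lo);          /* q * P (degree < 64, fits in lo) */
  return (uint32_t) (a ^ lo);       /* a - q*P = a mod P               */
}

int main(void)
{
  const uint64_t P= 0x104C11DB7ULL;      /* x^32+x^26+...+1 (CRC-32/IEEE) */
  const uint64_t mu= barrett_mu(P);
  const uint64_t a= 0x0123456789ABCDEFULL;
  assert(barrett_mod(a, P, mu) == poly_mod(a, P));
  printf("mu=%#llx  a mod P=%#x\n", (unsigned long long) mu,
         (unsigned) poly_mod(a, P));
  return 0;
}

Because the intermediate product is taken 64 bits "too high" and shifted back down, q rounds down to the nearest multiple of P, which is exactly the rounding argument the in-tree comment makes; the REFLECT branch applies the same idea with bit-reflected constants so the data never has to be reflected.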
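The crc32ieee.cc hunk replaces the removed my_checksum_init() call (see the my_init.c hunk) with a function pointer that init_crc32() resolves once, during static initialisation of the C++ translation unit. The following plain-C sketch only mirrors the shape of that dispatch: hw_crc32_available() is a hypothetical stand-in for crc32_pclmul_enabled() / crc32_aarch64_available(), the bitwise CRC stands in for the zlib fallback, and because C has no dynamic initialisers the pointer is resolved on first call rather than before main().

#include <stddef.h>
#include <stdio.h>

typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t);

/* bitwise CRC-32 (same convention as zlib's crc32) standing in for the fallback */
static unsigned int crc32_fallback(unsigned int crc, const void *data, size_t len)
{
  const unsigned char *p= data;
  crc= ~crc;
  while (len--)
  {
    crc^= *p++;
    for (int i= 0; i < 8; i++)
      crc= (crc >> 1) ^ (0xEDB88320U & (0U - (crc & 1U)));
  }
  return ~crc;
}

/* hypothetical stand-in for the runtime CPU feature probe */
static int hw_crc32_available(void)
{
  return 0;
}

/* mirrors init_crc32(): probe once, pick the accelerated routine or fall back */
static my_crc32_t resolve_crc32(void)
{
  if (hw_crc32_available())
    return crc32_fallback;          /* would be crc32_pclmul / crc32_aarch64 */
  return crc32_fallback;
}

/* in crc32ieee.cc the pointer is a file-scope constant filled in before main();
   this C sketch resolves it lazily instead */
unsigned int my_checksum_sketch(unsigned int crc, const void *data, size_t len)
{
  static my_crc32_t func;
  if (!func)
    func= resolve_crc32();
  return func(crc, data, len);
}

int main(void)
{
  /* standard CRC-32 check value for "123456789" is 0xCBF43926 */
  printf("%08x\n", my_checksum_sketch(0, "123456789", 9));
  return 0;
}

Resolving the pointer before main(), as the real C++ file does, also sidesteps the data race that the lazy, first-use initialisation in this sketch would have in a threaded server.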
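The `if (Count)` / `if (len)` guards added in the mf_iocache.c and my_alloc.c hunks matter because memcpy() requires valid (non-null) pointer arguments even when the length is zero; a zero-length call with a null source is undefined behaviour and is reported by sanitizers. A minimal illustration of the guard pattern (not MariaDB code):

#include <stdio.h>
#include <string.h>

static void copy_maybe_empty(char *dst, const char *src, size_t n)
{
  if (n)                       /* skip the call entirely for empty copies      */
    memcpy(dst, src, n);       /* src/dst only need to be valid when n is > 0  */
}

int main(void)
{
  char buf[8];
  copy_maybe_empty(buf, NULL, 0);    /* fine: the guard avoids the UB case */
  copy_maybe_empty(buf, "abc", 4);
  printf("%s\n", buf);
  return 0;
}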