| author | Marko Mäkelä <marko.makela@mariadb.com> | 2020-09-24 10:21:26 +0300 |
|---|---|---|
| committer | Marko Mäkelä <marko.makela@mariadb.com> | 2020-09-24 10:21:26 +0300 |
| commit | 6ce0a6f9ad77e7934e27db1b73d6d98064352928 (patch) | |
| tree | 351d7da0892c9a78310ffc39754c3ec4b38a188e /mysys | |
| parent | b5c050563b1bfa1155b3b6a3b7c0c59775e77f13 (diff) | |
| parent | 882ce206dbf06b771ffe4cbce2e3e4214982f302 (diff) | |
| download | mariadb-git-6ce0a6f9ad77e7934e27db1b73d6d98064352928.tar.gz | |
Merge 10.5 into 10.6
Diffstat (limited to 'mysys')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mysys/CMakeLists.txt | 58 |
| -rw-r--r-- | mysys/crc32/crc32_arm64.c | 19 |
| -rw-r--r-- | mysys/crc32/crc32_ppc64.c | 678 |
| -rw-r--r-- | mysys/crc32/crc32_x86.c | 811 |
| -rw-r--r-- | mysys/crc32/crc32c.cc | 1254 |
| -rw-r--r-- | mysys/crc32/crc32c_ppc.c | 5 |
| -rw-r--r-- | mysys/crc32/crc32c_ppc.h | 19 |
| -rw-r--r-- | mysys/crc32/crc_ppc64.h | 664 |
| -rw-r--r-- | mysys/crc32ieee.cc (renamed from mysys/checksum.c) | 43 |
| -rw-r--r-- | mysys/mf_iocache.c | 8 |
| -rw-r--r-- | mysys/my_alloc.c | 4 |
| -rw-r--r-- | mysys/my_init.c | 3 |
| -rw-r--r-- | mysys/my_rename.c | 59 |
13 files changed, 2395 insertions, 1230 deletions
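Much of the new code in mysys/crc32/crc32_x86.c hinges on a run-time CPUID probe: `crc32_pclmul_enabled()` checks ECX bit 20 (SSE4.2) and bit 1 (PCLMULQDQ) of CPUID leaf 1 before the PCLMUL-based `crc32_pclmul()` is used. The sketch below shows one way such a probe could drive implementation selection; it is illustrative only, and `pick_crc32()` together with the `crc32_table_based()` fallback are hypothetical names, not part of this patch.

```c
#include <stddef.h>

#if defined(__GNUC__)
# include <cpuid.h>
#elif defined(_MSC_VER)
# include <intrin.h>
#endif

/* CPUID leaf 1, ECX: bit 20 = SSE4.2, bit 1 = PCLMULQDQ (the same bits the patch tests). */
#define BITS_SSE42_AND_PCLMUL ((1u << 20) | (1u << 1))

/* Provided by mysys/crc32/crc32_x86.c in this patch. */
unsigned int crc32_pclmul(unsigned int crc, const void *buf, size_t len);
/* Hypothetical portable fallback, declared only to make this sketch self-contained. */
unsigned int crc32_table_based(unsigned int crc, const void *buf, size_t len);

typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t);

/* Resolve the CRC32 implementation once, based on what the CPU reports. */
static my_crc32_t pick_crc32(void)
{
#if defined(__GNUC__)
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) &&
      (ecx & BITS_SSE42_AND_PCLMUL) == BITS_SSE42_AND_PCLMUL)
    return crc32_pclmul;
#elif defined(_MSC_VER)
  int regs[4];
  __cpuid(regs, 1);
  if (((unsigned int) regs[2] & BITS_SSE42_AND_PCLMUL) == BITS_SSE42_AND_PCLMUL)
    return crc32_pclmul;
#endif
  return crc32_table_based;
}
```

A caller would typically resolve the function pointer once at startup and route all later checksum calls through it, so the CPUID check is paid only once rather than per buffer.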
diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt index 3be4bc1b103..e7fd75b5359 100644 --- a/mysys/CMakeLists.txt +++ b/mysys/CMakeLists.txt @@ -16,7 +16,7 @@ INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/mysys) -SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c +SET(MYSYS_SOURCES array.c charset-def.c charset.c crc32ieee.cc my_default.c get_password.c errors.c hash.c list.c mf_cache.c mf_dirname.c mf_fn_ext.c @@ -45,7 +45,7 @@ SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c ../sql-common/my_time.c my_rdtsc.c psi_noop.c my_atomic_writes.c my_cpu.c my_likely.c my_largepage.c - file_logger.c my_dlerror.c) + file_logger.c my_dlerror.c crc32/crc32c.cc) IF (WIN32) SET (MYSYS_SOURCES ${MYSYS_SOURCES} @@ -58,20 +58,24 @@ IF (WIN32) my_win_popen.cc) ENDIF() -IF(NOT MSVC AND CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") - #Check for PCLMUL instruction (x86) - CHECK_C_SOURCE_COMPILES(" - int main() - { - asm volatile (\"pclmulqdq \\$0x00, %%xmm1, %%xmm0\":::\"cc\"); - return 0; - }" HAVE_CLMUL_INSTRUCTION) - - IF(HAVE_CLMUL_INSTRUCTION) +IF(MSVC) + SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c) + ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL) + IF(CLANG_CL) + SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.cc crc32/crc32c.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") + ENDIF() +ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686") + MY_CHECK_C_COMPILER_FLAG(-msse4.2) + MY_CHECK_C_COMPILER_FLAG(-mpclmul) + CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H) + CHECK_INCLUDE_FILE(x86intrin.h HAVE_X86INTRIN_H) + IF(have_C__msse4.2 AND have_C__mpclmul AND HAVE_CPUID_H AND HAVE_X86INTRIN_H) SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c) + SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c crc32/crc32c.cc PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") + ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL) ENDIF() ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") - IF(CMAKE_COMPILER_IS_GNUCC AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) + IF(CMAKE_COMPILER_IS_GNUCC) include(CheckCXXSourceCompiles) CHECK_CXX_SOURCE_COMPILES(" @@ -93,23 +97,29 @@ ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") #include <sys/auxv.h> int main() { foo(0); getauxval(AT_HWCAP); }" HAVE_ARMV8_CRYPTO) - CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_INTRINSICS) - IF(HAVE_ARMV8_CRC_CRYPTO_INTRINSICS) + CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_MARCH) + + IF(HAVE_ARMV8_CRC_CRYPTO_MARCH) + CHECK_INCLUDE_FILE(arm_acle.h HAVE_ARM_ACLE_H -march=armv8-a+crc+crypto) + IF(HAVE_ARM_ACLE_H) + ADD_DEFINITIONS(-DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS) + ENDIF() + IF(HAVE_ARMV8_CRC) + ADD_DEFINITIONS(-DHAVE_ARMV8_CRC) + ENDIF() + IF(HAVE_ARMV8_CRYPTO) + ADD_DEFINITIONS(-DHAVE_ARMV8_CRYPTO) + ENDIF() SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_arm64.c) SET_SOURCE_FILES_PROPERTIES(crc32/crc32_arm64.c PROPERTIES COMPILE_FLAGS "-march=armv8-a+crc+crypto") ENDIF() ENDIF() ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64") - SET(HAVE_CRC32_VPMSUM 1 PARENT_SCOPE) - SET(MYSYS_SOURCES ${MYSYS_SOURCES} $<TARGET_OBJECTS:crc32c> $<TARGET_OBJECTS:crc32ieee>) - - ADD_LIBRARY(crc32c OBJECT crc32/crc32_ppc64.c) - ADD_LIBRARY(crc32ieee OBJECT crc32/crc32_ppc64.c) - - SET_TARGET_PROPERTIES(crc32c crc32ieee PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -maltivec -mvsx -mpower8-vector -mcrypto -mpower8-vector") - SET_TARGET_PROPERTIES(crc32ieee PROPERTIES 
COMPILE_DEFINITIONS "CRC32_FUNCTION=my_checksum;CRC32_CONSTANTS_HEADER=\"pcc_crc32_constants.h\"") - SET_TARGET_PROPERTIES(crc32c PROPERTIES COMPILE_DEFINITIONS "CRC32_FUNCTION=crc32c_vpmsum;CRC32_CONSTANTS_HEADER=\"pcc_crc32c_constants.h\"") + SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_ppc64.c crc32/crc32c_ppc.c) + SET_SOURCE_FILES_PROPERTIES(crc32/crc32_ppc64.c crc32/crc32c_ppc.c PROPERTIES + COMPILE_FLAGS "${COMPILE_FLAGS} -maltivec -mvsx -mpower8-vector -mcrypto -mpower8-vector") + ADD_DEFINITIONS(-DHAVE_POWER8 -DHAS_ALTIVEC) ENDIF() IF(UNIX) diff --git a/mysys/crc32/crc32_arm64.c b/mysys/crc32/crc32_arm64.c index a7eb2a47442..b82d4701e6f 100644 --- a/mysys/crc32/crc32_arm64.c +++ b/mysys/crc32/crc32_arm64.c @@ -57,6 +57,12 @@ asm(".arch_extension crypto"); #define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) +#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) + + #define CRC32C3X8(buffer, ITR) \ __asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\ __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\ @@ -73,6 +79,11 @@ asm(".arch_extension crypto"); #define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value)) #define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value)) +#define CRC32X(crc, value) (crc) = __crc32d((crc), (value)) +#define CRC32W(crc, value) (crc) = __crc32w((crc), (value)) +#define CRC32H(crc, value) (crc) = __crc32h((crc), (value)) +#define CRC32B(crc, value) (crc) = __crc32b((crc), (value)) + #define CRC32C3X8(buffer, ITR) \ crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\ crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\ @@ -119,7 +130,7 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len) uint32_t crc0, crc1, crc2; int64_t length= (int64_t)len; - crc= 0xFFFFFFFFU; + crc^= 0xffffffff; /* Pmull runtime check here. * Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030). @@ -282,16 +293,16 @@ unsigned int crc32_aarch64(unsigned int crc, const void *buf, size_t len) /* if start pointer is not 8 bytes aligned */ while ((buf1 != (const uint8_t *) buf8) && len) { - crc= __crc32b(crc, *buf1++); + CRC32B(crc, *buf1++); len--; } for (; len >= 8; len-= 8) - crc= __crc32d(crc, *buf8++); + CRC32X(crc, *buf8++); buf1= (const uint8_t *) buf8; while (len--) - crc= __crc32b(crc, *buf1++); + CRC32B(crc, *buf1++); return ~crc; } diff --git a/mysys/crc32/crc32_ppc64.c b/mysys/crc32/crc32_ppc64.c index 2e8b9fc1b12..76df88ee231 100644 --- a/mysys/crc32/crc32_ppc64.c +++ b/mysys/crc32/crc32_ppc64.c @@ -1,675 +1,5 @@ -/* - * Calculate the checksum of data that is 16 byte aligned and a multiple of - * 16 bytes. - * - * The first step is to reduce it to 1024 bits. We do this in 8 parallel - * chunks in order to mask the latency of the vpmsum instructions. If we - * have more than 32 kB of data to checksum we repeat this step multiple - * times, passing in the previous 1024 bits. - * - * The next step is to reduce the 1024 bits to 64 bits. 
This step adds - * 32 bits of 0s to the end - this matches what a CRC does. We just - * calculate constants that land the data in this 32 bits. - * - * We then use fixed point Barrett reduction to compute a mod n over GF(2) - * for n = CRC using POWER8 instructions. We use x = 32. - * - * http://en.wikipedia.org/wiki/Barrett_reduction - * - * This code uses gcc vector builtins instead using assembly directly. - * - * Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of either: - * - * a) the GNU General Public License as published by the Free Software - * Foundation; either version 2 of the License, or (at your option) - * any later version, or - * b) the Apache License, Version 2.0 - */ - -#include <altivec.h> - -#define POWER8_INTRINSICS +#define CRC32_FUNCTION my_checksum #define CRC_TABLE - -#ifdef CRC32_CONSTANTS_HEADER -#include CRC32_CONSTANTS_HEADER -#else -#include "crc32_constants.h" -#endif - -#define VMX_ALIGN 16 -#define VMX_ALIGN_MASK (VMX_ALIGN-1) - -#ifdef REFLECT -static unsigned int crc32_align(unsigned int crc, const unsigned char *p, - unsigned long len) -{ - while (len--) - crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); - return crc; -} -#else -static unsigned int crc32_align(unsigned int crc, const unsigned char *p, - unsigned long len) -{ - while (len--) - crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); - return crc; -} -#endif - -static unsigned int __attribute__ ((aligned (32))) -__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); - -#ifndef CRC32_FUNCTION -#define CRC32_FUNCTION crc32_vpmsum -#endif - -unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, - unsigned long len) -{ - unsigned int prealign; - unsigned int tail; - -#ifdef CRC_XOR - crc ^= 0xffffffff; -#endif - - if (len < VMX_ALIGN + VMX_ALIGN_MASK) { - crc = crc32_align(crc, p, len); - goto out; - } - - if ((unsigned long)p & VMX_ALIGN_MASK) { - prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); - crc = crc32_align(crc, p, prealign); - len -= prealign; - p += prealign; - } - - crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); - - tail = len & VMX_ALIGN_MASK; - if (tail) { - p += len & ~VMX_ALIGN_MASK; - crc = crc32_align(crc, p, tail); - } - -out: -#ifdef CRC_XOR - crc ^= 0xffffffff; -#endif - - return crc; -} - -#if defined (__clang__) -#include "clang_workaround.h" -#else -#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b)) -#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0) -#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1) -#endif - -/* When we have a load-store in a single-dispatch group and address overlap - * such that foward is not allowed (load-hit-store) the group must be flushed. - * A group ending NOP prevents the flush. - */ -#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory") - -#if defined(__BIG_ENDIAN__) && defined (REFLECT) -#define BYTESWAP_DATA -#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) -#define BYTESWAP_DATA -#endif - -#ifdef BYTESWAP_DATA -#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\ - (__vector unsigned char) vc) -#if defined(__LITTLE_ENDIAN__) -/* Byte reverse permute constant LE. 
*/ -static const __vector unsigned long long vperm_const - __attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL, - 0x0001020304050607UL }; -#else -static const __vector unsigned long long vperm_const - __attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL, - 0X0706050403020100UL }; -#endif -#else -#define VEC_PERM(vr, va, vb, vc) -#endif - -static unsigned int __attribute__ ((aligned (32))) -__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { - - const __vector unsigned long long vzero = {0,0}; - const __vector unsigned long long vones = {0xffffffffffffffffUL, - 0xffffffffffffffffUL}; - -#ifdef REFLECT - __vector unsigned char vsht_splat; - const __vector unsigned long long vmask_32bit = - (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, - (__vector unsigned char)vones, 4); -#endif - - const __vector unsigned long long vmask_64bit = - (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, - (__vector unsigned char)vones, 8); - - __vector unsigned long long vcrc; - - __vector unsigned long long vconst1, vconst2; - - /* vdata0-vdata7 will contain our data (p). */ - __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, - vdata5, vdata6, vdata7; - - /* v0-v7 will contain our checksums */ - __vector unsigned long long v0 = {0,0}; - __vector unsigned long long v1 = {0,0}; - __vector unsigned long long v2 = {0,0}; - __vector unsigned long long v3 = {0,0}; - __vector unsigned long long v4 = {0,0}; - __vector unsigned long long v5 = {0,0}; - __vector unsigned long long v6 = {0,0}; - __vector unsigned long long v7 = {0,0}; - - - /* Vector auxiliary variables. */ - __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; - - unsigned int result = 0; - unsigned int offset; /* Constant table offset. */ - - unsigned long i; /* Counter. */ - unsigned long chunks; - - unsigned long block_size; - int next_block = 0; - - /* Align by 128 bits. The last 128 bit block will be processed at end. */ - unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; - -#ifdef REFLECT - vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc); -#else - vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL); - - /* Shift into top 32 bits */ - vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc, - (__vector unsigned char)vzero, 4); -#endif - - /* Short version. */ - if (len < 256) { - /* Calculate where in the constant table we need to start. */ - offset = 256 - len; - - vconst1 = vec_ld(offset, vcrc_short_const); - vdata0 = vec_ld(0, (__vector unsigned long long*) p); - VEC_PERM(vdata0, vdata0, vconst1, vperm_const); - - /* xor initial value*/ - vdata0 = vec_xor(vdata0, vcrc); - - vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw - ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); - v0 = vec_xor(v0, vdata0); - - for (i = 16; i < len; i += 16) { - vconst1 = vec_ld(offset + i, vcrc_short_const); - vdata0 = vec_ld(i, (__vector unsigned long long*) p); - VEC_PERM(vdata0, vdata0, vconst1, vperm_const); - vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw - ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); - v0 = vec_xor(v0, vdata0); - } - } else { - - /* Load initial values. 
*/ - vdata0 = vec_ld(0, (__vector unsigned long long*) p); - vdata1 = vec_ld(16, (__vector unsigned long long*) p); - - VEC_PERM(vdata0, vdata0, vdata0, vperm_const); - VEC_PERM(vdata1, vdata1, vdata1, vperm_const); - - vdata2 = vec_ld(32, (__vector unsigned long long*) p); - vdata3 = vec_ld(48, (__vector unsigned long long*) p); - - VEC_PERM(vdata2, vdata2, vdata2, vperm_const); - VEC_PERM(vdata3, vdata3, vdata3, vperm_const); - - vdata4 = vec_ld(64, (__vector unsigned long long*) p); - vdata5 = vec_ld(80, (__vector unsigned long long*) p); - - VEC_PERM(vdata4, vdata4, vdata4, vperm_const); - VEC_PERM(vdata5, vdata5, vdata5, vperm_const); - - vdata6 = vec_ld(96, (__vector unsigned long long*) p); - vdata7 = vec_ld(112, (__vector unsigned long long*) p); - - VEC_PERM(vdata6, vdata6, vdata6, vperm_const); - VEC_PERM(vdata7, vdata7, vdata7, vperm_const); - - /* xor in initial value */ - vdata0 = vec_xor(vdata0, vcrc); - - p = (char *)p + 128; - - do { - /* Checksum in blocks of MAX_SIZE. */ - block_size = length; - if (block_size > MAX_SIZE) { - block_size = MAX_SIZE; - } - - length = length - block_size; - - /* - * Work out the offset into the constants table to start at. Each - * constant is 16 bytes, and it is used against 128 bytes of input - * data - 128 / 16 = 8 - */ - offset = (MAX_SIZE/8) - (block_size/8); - /* We reduce our final 128 bytes in a separate step */ - chunks = (block_size/128)-1; - - vconst1 = vec_ld(offset, vcrc_const); - - va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, - (__vector unsigned long long)vconst1); - va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, - (__vector unsigned long long)vconst1); - va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, - (__vector unsigned long long)vconst1); - va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, - (__vector unsigned long long)vconst1); - va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, - (__vector unsigned long long)vconst1); - va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, - (__vector unsigned long long)vconst1); - va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, - (__vector unsigned long long)vconst1); - va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, - (__vector unsigned long long)vconst1); - - if (chunks > 1) { - offset += 16; - vconst2 = vec_ld(offset, vcrc_const); - GROUP_ENDING_NOP; - - vdata0 = vec_ld(0, (__vector unsigned long long*) p); - VEC_PERM(vdata0, vdata0, vdata0, vperm_const); - - vdata1 = vec_ld(16, (__vector unsigned long long*) p); - VEC_PERM(vdata1, vdata1, vdata1, vperm_const); - - vdata2 = vec_ld(32, (__vector unsigned long long*) p); - VEC_PERM(vdata2, vdata2, vdata2, vperm_const); - - vdata3 = vec_ld(48, (__vector unsigned long long*) p); - VEC_PERM(vdata3, vdata3, vdata3, vperm_const); - - vdata4 = vec_ld(64, (__vector unsigned long long*) p); - VEC_PERM(vdata4, vdata4, vdata4, vperm_const); - - vdata5 = vec_ld(80, (__vector unsigned long long*) p); - VEC_PERM(vdata5, vdata5, vdata5, vperm_const); - - vdata6 = vec_ld(96, (__vector unsigned long long*) p); - VEC_PERM(vdata6, vdata6, vdata6, vperm_const); - - vdata7 = vec_ld(112, (__vector unsigned long long*) p); - VEC_PERM(vdata7, vdata7, vdata7, vperm_const); - - p = (char *)p + 128; - - /* - * main loop. We modulo schedule it such that it takes three - * iterations to complete - first iteration load, second - * iteration vpmsum, third iteration xor. 
- */ - for (i = 0; i < chunks-2; i++) { - vconst1 = vec_ld(offset, vcrc_const); - offset += 16; - GROUP_ENDING_NOP; - - v0 = vec_xor(v0, va0); - va0 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata0, (__vector unsigned long long)vconst2); - vdata0 = vec_ld(0, (__vector unsigned long long*) p); - VEC_PERM(vdata0, vdata0, vdata0, vperm_const); - GROUP_ENDING_NOP; - - v1 = vec_xor(v1, va1); - va1 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata1, (__vector unsigned long long)vconst2); - vdata1 = vec_ld(16, (__vector unsigned long long*) p); - VEC_PERM(vdata1, vdata1, vdata1, vperm_const); - GROUP_ENDING_NOP; - - v2 = vec_xor(v2, va2); - va2 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata2, (__vector unsigned long long)vconst2); - vdata2 = vec_ld(32, (__vector unsigned long long*) p); - VEC_PERM(vdata2, vdata2, vdata2, vperm_const); - GROUP_ENDING_NOP; - - v3 = vec_xor(v3, va3); - va3 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata3, (__vector unsigned long long)vconst2); - vdata3 = vec_ld(48, (__vector unsigned long long*) p); - VEC_PERM(vdata3, vdata3, vdata3, vperm_const); - - vconst2 = vec_ld(offset, vcrc_const); - GROUP_ENDING_NOP; - - v4 = vec_xor(v4, va4); - va4 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata4, (__vector unsigned long long)vconst1); - vdata4 = vec_ld(64, (__vector unsigned long long*) p); - VEC_PERM(vdata4, vdata4, vdata4, vperm_const); - GROUP_ENDING_NOP; - - v5 = vec_xor(v5, va5); - va5 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata5, (__vector unsigned long long)vconst1); - vdata5 = vec_ld(80, (__vector unsigned long long*) p); - VEC_PERM(vdata5, vdata5, vdata5, vperm_const); - GROUP_ENDING_NOP; - - v6 = vec_xor(v6, va6); - va6 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata6, (__vector unsigned long long)vconst1); - vdata6 = vec_ld(96, (__vector unsigned long long*) p); - VEC_PERM(vdata6, vdata6, vdata6, vperm_const); - GROUP_ENDING_NOP; - - v7 = vec_xor(v7, va7); - va7 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata7, (__vector unsigned long long)vconst1); - vdata7 = vec_ld(112, (__vector unsigned long long*) p); - VEC_PERM(vdata7, vdata7, vdata7, vperm_const); - - p = (char *)p + 128; - } - - /* First cool down*/ - vconst1 = vec_ld(offset, vcrc_const); - offset += 16; - - v0 = vec_xor(v0, va0); - va0 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata0, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v1 = vec_xor(v1, va1); - va1 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata1, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v2 = vec_xor(v2, va2); - va2 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata2, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v3 = vec_xor(v3, va3); - va3 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata3, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v4 = vec_xor(v4, va4); - va4 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata4, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v5 = vec_xor(v5, va5); - va5 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata5, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v6 = vec_xor(v6, va6); - va6 = __builtin_crypto_vpmsumd ((__vector unsigned long - long)vdata6, (__vector unsigned long long)vconst1); - GROUP_ENDING_NOP; - - v7 = vec_xor(v7, va7); - va7 = __builtin_crypto_vpmsumd ((__vector 
unsigned long - long)vdata7, (__vector unsigned long long)vconst1); - }/* else */ - - /* Second cool down. */ - v0 = vec_xor(v0, va0); - v1 = vec_xor(v1, va1); - v2 = vec_xor(v2, va2); - v3 = vec_xor(v3, va3); - v4 = vec_xor(v4, va4); - v5 = vec_xor(v5, va5); - v6 = vec_xor(v6, va6); - v7 = vec_xor(v7, va7); - -#ifdef REFLECT - /* - * vpmsumd produces a 96 bit result in the least significant bits - * of the register. Since we are bit reflected we have to shift it - * left 32 bits so it occupies the least significant bits in the - * bit reflected domain. - */ - v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, - (__vector unsigned char)vzero, 4); - v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, - (__vector unsigned char)vzero, 4); - v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, - (__vector unsigned char)vzero, 4); - v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, - (__vector unsigned char)vzero, 4); - v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, - (__vector unsigned char)vzero, 4); - v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, - (__vector unsigned char)vzero, 4); - v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, - (__vector unsigned char)vzero, 4); - v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, - (__vector unsigned char)vzero, 4); -#endif - - /* xor with the last 1024 bits. */ - va0 = vec_ld(0, (__vector unsigned long long*) p); - VEC_PERM(va0, va0, va0, vperm_const); - - va1 = vec_ld(16, (__vector unsigned long long*) p); - VEC_PERM(va1, va1, va1, vperm_const); - - va2 = vec_ld(32, (__vector unsigned long long*) p); - VEC_PERM(va2, va2, va2, vperm_const); - - va3 = vec_ld(48, (__vector unsigned long long*) p); - VEC_PERM(va3, va3, va3, vperm_const); - - va4 = vec_ld(64, (__vector unsigned long long*) p); - VEC_PERM(va4, va4, va4, vperm_const); - - va5 = vec_ld(80, (__vector unsigned long long*) p); - VEC_PERM(va5, va5, va5, vperm_const); - - va6 = vec_ld(96, (__vector unsigned long long*) p); - VEC_PERM(va6, va6, va6, vperm_const); - - va7 = vec_ld(112, (__vector unsigned long long*) p); - VEC_PERM(va7, va7, va7, vperm_const); - - p = (char *)p + 128; - - vdata0 = vec_xor(v0, va0); - vdata1 = vec_xor(v1, va1); - vdata2 = vec_xor(v2, va2); - vdata3 = vec_xor(v3, va3); - vdata4 = vec_xor(v4, va4); - vdata5 = vec_xor(v5, va5); - vdata6 = vec_xor(v6, va6); - vdata7 = vec_xor(v7, va7); - - /* Check if we have more blocks to process */ - next_block = 0; - if (length != 0) { - next_block = 1; - - /* zero v0-v7 */ - v0 = vec_xor(v0, v0); - v1 = vec_xor(v1, v1); - v2 = vec_xor(v2, v2); - v3 = vec_xor(v3, v3); - v4 = vec_xor(v4, v4); - v5 = vec_xor(v5, v5); - v6 = vec_xor(v6, v6); - v7 = vec_xor(v7, v7); - } - length = length + 128; - - } while (next_block); - - /* Calculate how many bytes we have left. */ - length = (len & 127); - - /* Calculate where in (short) constant table we need to start. 
*/ - offset = 128 - length; - - v0 = vec_ld(offset, vcrc_short_const); - v1 = vec_ld(offset + 16, vcrc_short_const); - v2 = vec_ld(offset + 32, vcrc_short_const); - v3 = vec_ld(offset + 48, vcrc_short_const); - v4 = vec_ld(offset + 64, vcrc_short_const); - v5 = vec_ld(offset + 80, vcrc_short_const); - v6 = vec_ld(offset + 96, vcrc_short_const); - v7 = vec_ld(offset + 112, vcrc_short_const); - - offset += 128; - - v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata0,(__vector unsigned int)v0); - v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata1,(__vector unsigned int)v1); - v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata2,(__vector unsigned int)v2); - v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata3,(__vector unsigned int)v3); - v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata4,(__vector unsigned int)v4); - v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata5,(__vector unsigned int)v5); - v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata6,(__vector unsigned int)v6); - v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata7,(__vector unsigned int)v7); - - /* Now reduce the tail (0-112 bytes). */ - for (i = 0; i < length; i+=16) { - vdata0 = vec_ld(i,(__vector unsigned long long*)p); - VEC_PERM(vdata0, vdata0, vdata0, vperm_const); - va0 = vec_ld(offset + i,vcrc_short_const); - va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( - (__vector unsigned int)vdata0,(__vector unsigned int)va0); - v0 = vec_xor(v0, va0); - } - - /* xor all parallel chunks together. */ - v0 = vec_xor(v0, v1); - v2 = vec_xor(v2, v3); - v4 = vec_xor(v4, v5); - v6 = vec_xor(v6, v7); - - v0 = vec_xor(v0, v2); - v4 = vec_xor(v4, v6); - - v0 = vec_xor(v0, v4); - } - - /* Barrett Reduction */ - vconst1 = vec_ld(0, v_Barrett_const); - vconst2 = vec_ld(16, v_Barrett_const); - - v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, - (__vector unsigned char)v0, 8); - v0 = vec_xor(v1,v0); - -#ifdef REFLECT - /* shift left one bit */ - vsht_splat = vec_splat_u8 (1); - v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0, - vsht_splat); -#endif - - v0 = vec_and(v0, vmask_64bit); - -#ifndef REFLECT - - /* - * Now for the actual algorithm. The idea is to calculate q, - * the multiple of our polynomial that we need to subtract. By - * doing the computation 2x bits higher (ie 64 bits) and shifting the - * result back down 2x bits, we round down to the nearest multiple. - */ - - /* ma */ - v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0, - (__vector unsigned long long)vconst1); - /* q = floor(ma/(2^64)) */ - v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero, - (__vector unsigned char)v1, 8); - /* qn */ - v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, - (__vector unsigned long long)vconst2); - /* a - qn, subtraction is xor in GF(2) */ - v0 = vec_xor (v0, v1); - /* - * Get the result into r3. We need to shift it left 8 bytes: - * V0 [ 0 1 2 X ] - * V0 [ 0 X 2 3 ] - */ - result = __builtin_unpack_vector_1 (v0); -#else - - /* - * The reflected version of Barrett reduction. 
Instead of bit - * reflecting our data (which is expensive to do), we bit reflect our - * constants and our algorithm, which means the intermediate data in - * our vector registers goes from 0-63 instead of 63-0. We can reflect - * the algorithm because we don't carry in mod 2 arithmetic. - */ - - /* bottom 32 bits of a */ - v1 = vec_and(v0, vmask_32bit); - - /* ma */ - v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, - (__vector unsigned long long)vconst1); - - /* bottom 32bits of ma */ - v1 = vec_and(v1, vmask_32bit); - /* qn */ - v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, - (__vector unsigned long long)vconst2); - /* a - qn, subtraction is xor in GF(2) */ - v0 = vec_xor (v0, v1); - - /* - * Since we are bit reflected, the result (ie the low 32 bits) is in - * the high 32 bits. We just need to shift it left 4 bytes - * V0 [ 0 1 X 3 ] - * V0 [ 0 X 2 3 ] - */ - - /* shift result into top 64 bits of */ - v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, - (__vector unsigned char)vzero, 4); - - result = __builtin_unpack_vector_0 (v0); -#endif - - return result; -} +#define POWER8_INTRINSICS +#include "pcc_crc32_constants.h" +#include "crc_ppc64.h" diff --git a/mysys/crc32/crc32_x86.c b/mysys/crc32/crc32_x86.c index 3f176a6c145..1e5d2a0a089 100644 --- a/mysys/crc32/crc32_x86.c +++ b/mysys/crc32/crc32_x86.c @@ -1,545 +1,358 @@ -/****************************************************** -Copyright (c) 2017 Percona LLC and/or its affiliates. +/* Copyright (c) 2020 MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Implementation of CRC32 (Ethernet) uing Intel PCLMULQDQ + Ported from Intels work, see https://github.com/intel/soft-crc +*/ + +/******************************************************************************* + Copyright (c) 2009-2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ -CRC32 using Intel's PCLMUL instruction. -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; version 2 of the License. +#include <my_global.h> +#include <my_compiler.h> -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <stddef.h> -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +#if defined(__GNUC__) +#include <x86intrin.h> +#include <cpuid.h> +#elif defined(_MSC_VER) +#include <intrin.h> +#else +#error "unknown compiler" +#endif -*******************************************************/ +static int has_sse42_and_pclmul(uint32_t recx) +{ + /* 1 << 20 is SSE42, 1 << 1 is PCLMULQDQ */ +#define bits_SSE42_AND_PCLMUL (1 << 20 | 1 << 1) + return (recx & bits_SSE42_AND_PCLMUL) == bits_SSE42_AND_PCLMUL; +} -/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation - * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi> +#ifdef __GNUC__ +int crc32_pclmul_enabled(void) +{ + uint32_t reax= 0, rebx= 0, recx= 0, redx= 0; + __cpuid(1, reax, rebx, recx, redx); + return has_sse42_and_pclmul(recx); +} +#elif defined(_MSC_VER) +int crc32_pclmul_enabled(void) +{ + int regs[4]; + __cpuid(regs, 1); + return has_sse42_and_pclmul(regs[2]); +} +#endif + +/** + * @brief Shifts left 128 bit register by specified number of bytes * - * This file is part of Libgcrypt. + * @param reg 128 bit value + * @param num number of bytes to shift left \a reg by (0-16) * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. 
+ * @return \a reg << (\a num * 8) + */ +static inline __m128i xmm_shift_left(__m128i reg, const unsigned int num) +{ + static const MY_ALIGNED(16) uint8_t crc_xmm_shift_tab[48]= { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + + const __m128i *p= (const __m128i *) (crc_xmm_shift_tab + 16 - num); + + return _mm_shuffle_epi8(reg, _mm_loadu_si128(p)); +} + +struct crcr_pclmulqdq_ctx +{ + uint64_t rk1; + uint64_t rk2; + uint64_t rk5; + uint64_t rk6; + uint64_t rk7; + uint64_t rk8; +}; + +/** + * @brief Performs one folding round * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. + * Logically function operates as follows: + * DATA = READ_NEXT_16BYTES(); + * F1 = LSB8(FOLD) + * F2 = MSB8(FOLD) + * T1 = CLMUL(F1, RK1) + * T2 = CLMUL(F2, RK2) + * FOLD = XOR(T1, T2, DATA) * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + * @param data_block 16 byte data block + * @param precomp precomputed rk1 constanst + * @param fold running 16 byte folded data * + * @return New 16 byte folded data */ +static inline __m128i crcr32_folding_round(const __m128i data_block, + const __m128i precomp, const __m128i fold) +{ + __m128i tmp0= _mm_clmulepi64_si128(fold, precomp, 0x01); + __m128i tmp1= _mm_clmulepi64_si128(fold, precomp, 0x10); -#include <my_global.h> + return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0)); +} -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <stdint.h> +/** + * @brief Performs reduction from 128 bits to 64 bits + * + * @param data128 128 bits data to be reduced + * @param precomp rk5 and rk6 precomputed constants + * + * @return data reduced to 64 bits + */ +static inline __m128i crcr32_reduce_128_to_64(__m128i data128, const __m128i precomp) +{ + __m128i tmp0, tmp1, tmp2; -# define U64_C(c) (c ## UL) + /* 64b fold */ + tmp0= _mm_clmulepi64_si128(data128, precomp, 0x00); + tmp1= _mm_srli_si128(data128, 8); + tmp0= _mm_xor_si128(tmp0, tmp1); -typedef uint32_t u32; -typedef uint16_t u16; -typedef uint64_t u64; -#ifndef byte -typedef uint8_t byte; -#endif + /* 32b fold */ + tmp2= _mm_slli_si128(tmp0, 4); + tmp1= _mm_clmulepi64_si128(tmp2, precomp, 0x10); -# define _gcry_bswap32 __builtin_bswap32 + return _mm_xor_si128(tmp1, tmp0); +} -#if __GNUC__ >= 4 && defined(__x86_64__) +/** + * @brief Performs Barret's reduction from 64 bits to 32 bits + * + * @param data64 64 bits data to be reduced + * @param precomp rk7 precomputed constant + * + * @return data reduced to 32 bits + */ +static inline uint32_t crcr32_reduce_64_to_32(__m128i data64, const __m128i precomp) +{ + static const MY_ALIGNED(16) uint32_t mask1[4]= { + 0xffffffff, 0xffffffff, 0x00000000, 0x00000000}; + static const MY_ALIGNED(16) uint32_t mask2[4]= { + 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff}; + __m128i tmp0, tmp1, tmp2; -#if defined(_GCRY_GCC_VERSION) && _GCRY_GCC_VERSION >= 40400 /* 4.4 */ -/* Prevent compiler from issuing SSE instructions between asm blocks. 
*/ -# pragma GCC target("no-sse") -#endif + tmp0= _mm_and_si128(data64, _mm_load_si128((__m128i *) mask2)); + tmp1= _mm_clmulepi64_si128(tmp0, precomp, 0x00); + tmp1= _mm_xor_si128(tmp1, tmp0); + tmp1= _mm_and_si128(tmp1, _mm_load_si128((__m128i *) mask1)); -#define ALIGNED_16 __attribute__ ((aligned (16))) + tmp2= _mm_clmulepi64_si128(tmp1, precomp, 0x10); + tmp2= _mm_xor_si128(tmp2, tmp1); + tmp2= _mm_xor_si128(tmp2, tmp0); + return _mm_extract_epi32(tmp2, 2); +} -struct u16_unaligned_s +/** + * @brief Calculates reflected 32-bit CRC for given \a data block + * by applying folding and reduction methods. + * + * Algorithm operates on 32 bit CRCs. + * Polynomials and initial values may need to be promoted to + * 32 bits where required. + * + * @param crc initial CRC value (32 bit value) + * @param data pointer to data block + * @param data_len length of \a data block in bytes + * @param params pointer to PCLMULQDQ CRC calculation context + * + * @return CRC for given \a data block (32 bits wide). + */ +static inline uint32_t crcr32_calc_pclmulqdq(const uint8_t *data, uint32_t data_len, + uint32_t crc, + const struct crcr_pclmulqdq_ctx *params) { - u16 a; -} __attribute__((packed, aligned (1), may_alias)); + __m128i temp, fold, k; + uint32_t n; + DBUG_ASSERT(data != NULL || data_len == 0); + DBUG_ASSERT(params); -/* Constants structure for generic reflected/non-reflected CRC32 CLMUL - * functions. */ -struct crc32_consts_s -{ - /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */ - u64 k[6]; - /* my_p: { floor(x^64 / P(x)), P(x) } */ - u64 my_p[2]; -}; + if (unlikely(data_len == 0)) + return crc; + /** + * Get CRC init value + */ + temp= _mm_insert_epi32(_mm_setzero_si128(), crc, 0); -/* CLMUL constants for CRC32 and CRC32RFC1510. */ -static const struct crc32_consts_s crc32_consts ALIGNED_16 = -{ - { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */ - U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */ - U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */ - U64_C(0x163cd6124), 0 /* y = 2 */ - }, - { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */ - U64_C(0x1f7011641), U64_C(0x1db710641) - } -}; + /** + * ------------------------------------------------- + * Folding all data into single 16 byte data block + * Assumes: \a fold holds first 16 bytes of data + */ -/* Common constants for CRC32 algorithms. 
*/ -static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 = - { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - }; -static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 = - { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - }; -static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 = + if (unlikely(data_len < 32)) { - { U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */ - { U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) }, - { U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) }, - { U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) }, - { U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) }, - { U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) }, - { U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */ - }; -static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 = - { - { U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */ - { U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) }, - { U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */ - }; - -/* PCLMUL functions for reflected CRC32. */ -static inline void -crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, - const struct crc32_consts_s *consts) -{ - if (inlen >= 8 * 16) + if (unlikely(data_len == 16)) { - asm volatile ("movd %[crc], %%xmm4\n\t" - "movdqu %[inbuf_0], %%xmm0\n\t" - "movdqu %[inbuf_1], %%xmm1\n\t" - "movdqu %[inbuf_2], %%xmm2\n\t" - "movdqu %[inbuf_3], %%xmm3\n\t" - "pxor %%xmm4, %%xmm0\n\t" - : - : [inbuf_0] "m" (inbuf[0 * 16]), - [inbuf_1] "m" (inbuf[1 * 16]), - [inbuf_2] "m" (inbuf[2 * 16]), - [inbuf_3] "m" (inbuf[3 * 16]), - [crc] "m" (*pcrc) - ); - - inbuf += 4 * 16; - inlen -= 4 * 16; - - asm volatile ("movdqa %[k1k2], %%xmm4\n\t" - : - : [k1k2] "m" (consts->k[1 - 1]) - ); - - /* Fold by 4. */ - while (inlen >= 4 * 16) - { - asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t" - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm0\n\t" - "pxor %%xmm6, %%xmm0\n\t" - - "movdqu %[inbuf_1], %%xmm5\n\t" - "movdqa %%xmm1, %%xmm6\n\t" - "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t" - "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm1\n\t" - "pxor %%xmm6, %%xmm1\n\t" - - "movdqu %[inbuf_2], %%xmm5\n\t" - "movdqa %%xmm2, %%xmm6\n\t" - "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t" - "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm6, %%xmm2\n\t" - - "movdqu %[inbuf_3], %%xmm5\n\t" - "movdqa %%xmm3, %%xmm6\n\t" - "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t" - "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm6, %%xmm3\n\t" - : - : [inbuf_0] "m" (inbuf[0 * 16]), - [inbuf_1] "m" (inbuf[1 * 16]), - [inbuf_2] "m" (inbuf[2 * 16]), - [inbuf_3] "m" (inbuf[3 * 16]) - ); - - inbuf += 4 * 16; - inlen -= 4 * 16; - } - - asm volatile ("movdqa %[k3k4], %%xmm6\n\t" - "movdqa %[my_p], %%xmm5\n\t" - : - : [k3k4] "m" (consts->k[3 - 1]), - [my_p] "m" (consts->my_p[0]) - ); - - /* Fold 4 to 1. 
*/ - - asm volatile ("movdqa %%xmm0, %%xmm4\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t" - "pxor %%xmm1, %%xmm0\n\t" - "pxor %%xmm4, %%xmm0\n\t" - - "movdqa %%xmm0, %%xmm4\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t" - "pxor %%xmm2, %%xmm0\n\t" - "pxor %%xmm4, %%xmm0\n\t" - - "movdqa %%xmm0, %%xmm4\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t" - "pxor %%xmm3, %%xmm0\n\t" - "pxor %%xmm4, %%xmm0\n\t" - : - : - ); + /* 16 bytes */ + fold= _mm_loadu_si128((__m128i *) data); + fold= _mm_xor_si128(fold, temp); + goto reduction_128_64; } - else + if (unlikely(data_len < 16)) { - asm volatile ("movd %[crc], %%xmm1\n\t" - "movdqu %[inbuf], %%xmm0\n\t" - "movdqa %[k3k4], %%xmm6\n\t" - "pxor %%xmm1, %%xmm0\n\t" - "movdqa %[my_p], %%xmm5\n\t" - : - : [inbuf] "m" (*inbuf), - [crc] "m" (*pcrc), - [k3k4] "m" (consts->k[3 - 1]), - [my_p] "m" (consts->my_p[0]) - ); - - inbuf += 16; - inlen -= 16; + /* 0 to 15 bytes */ + MY_ALIGNED(16) uint8_t buffer[16]; + + memset(buffer, 0, sizeof(buffer)); + memcpy(buffer, data, data_len); + + fold= _mm_load_si128((__m128i *) buffer); + fold= _mm_xor_si128(fold, temp); + if ((data_len < 4)) + { + fold= xmm_shift_left(fold, 8 - data_len); + goto barret_reduction; + } + fold= xmm_shift_left(fold, 16 - data_len); + goto reduction_128_64; } + /* 17 to 31 bytes */ + fold= _mm_loadu_si128((__m128i *) data); + fold= _mm_xor_si128(fold, temp); + n= 16; + k= _mm_load_si128((__m128i *) (¶ms->rk1)); + goto partial_bytes; + } - /* Fold by 1. */ - if (inlen >= 16) - { - while (inlen >= 16) - { - /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */ - asm volatile ("movdqu %[inbuf], %%xmm2\n\t" - "movdqa %%xmm0, %%xmm1\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t" - "pxor %%xmm2, %%xmm0\n\t" - "pxor %%xmm1, %%xmm0\n\t" - : - : [inbuf] "m" (*inbuf) - ); - - inbuf += 16; - inlen -= 16; - } - } + /** + * At least 32 bytes in the buffer + */ + + /** + * Apply CRC initial value + */ + fold= _mm_loadu_si128((const __m128i *) data); + fold= _mm_xor_si128(fold, temp); + + /** + * Main folding loop + * - the last 16 bytes is processed separately + */ + k= _mm_load_si128((__m128i *) (¶ms->rk1)); + for (n= 16; (n + 16) <= data_len; n+= 16) + { + temp= _mm_loadu_si128((__m128i *) &data[n]); + fold= crcr32_folding_round(temp, k, fold); + } - /* Partial fold. */ - if (inlen) - { - /* Load last input and add padding zeros. 
*/ - asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t" - "movdqu %[shl_shuf], %%xmm4\n\t" - "movdqu %[mask], %%xmm2\n\t" - - "movdqa %%xmm0, %%xmm1\n\t" - "pshufb %%xmm4, %%xmm0\n\t" - "movdqu %[inbuf], %%xmm4\n\t" - "pshufb %%xmm3, %%xmm1\n\t" - "pand %%xmm4, %%xmm2\n\t" - "por %%xmm1, %%xmm2\n\t" - - "movdqa %%xmm0, %%xmm1\n\t" - "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" - "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t" - "pxor %%xmm2, %%xmm0\n\t" - "pxor %%xmm1, %%xmm0\n\t" - : - : [inbuf] "m" (*(inbuf - 16 + inlen)), - [mask] "m" (crc32_partial_fold_input_mask[inlen]), - [shl_shuf] "m" (crc32_refl_shuf_shift[inlen]), - [shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16]) - ); - - inbuf += inlen; - inlen -= inlen; - } +partial_bytes: + if (likely(n < data_len)) + { + static const MY_ALIGNED(16) uint32_t mask3[4]= {0x80808080, 0x80808080, + 0x80808080, 0x80808080}; + static const MY_ALIGNED(16) uint8_t shf_table[32]= { + 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, + 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + __m128i last16, a, b; - /* Final fold. */ - asm volatile (/* reduce 128-bits to 96-bits */ - "movdqa %%xmm0, %%xmm1\n\t" - "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t" - "psrldq $8, %%xmm1\n\t" - "pxor %%xmm1, %%xmm0\n\t" - - /* reduce 96-bits to 64-bits */ - "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */ - "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */ - "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */ - "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */ - - /* barrett reduction */ - "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */ - "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */ - "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */ - "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */ - "pxor %%xmm1, %%xmm0\n\t" - - /* store CRC */ - "pextrd $2, %%xmm0, %[out]\n\t" - : [out] "=m" (*pcrc) - : [k5] "m" (consts->k[5 - 1]) - ); -} + last16= _mm_loadu_si128((const __m128i *) &data[data_len - 16]); -static inline void -crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen, - const struct crc32_consts_s *consts) -{ - if (inlen < 4) - { - u32 crc = *pcrc; - u32 data; - - asm volatile ("movdqa %[my_p], %%xmm5\n\t" - : - : [my_p] "m" (consts->my_p[0]) - ); - - if (inlen == 1) - { - data = inbuf[0]; - data ^= crc; - data <<= 24; - crc >>= 8; - } - else if (inlen == 2) - { - data = ((const struct u16_unaligned_s *)inbuf)->a; - data ^= crc; - data <<= 16; - crc >>= 16; - } - else - { - data = ((const struct u16_unaligned_s *)inbuf)->a; - data |= ((u32) inbuf[2]) << 16; - data ^= crc; - data <<= 8; - crc >>= 24; - } - - /* Barrett reduction */ - asm volatile ("movd %[in], %%xmm0\n\t" - "movd %[crc], %%xmm1\n\t" - - "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */ - "psllq $32, %%xmm1\n\t" - "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */ - "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */ - "pxor %%xmm1, %%xmm0\n\t" - - "pextrd $1, %%xmm0, %[out]\n\t" - : [out] "=m" (*pcrc) - : [in] "rm" (data), - [crc] "rm" (crc) - ); - } - else if (inlen == 4) - { - /* Barrett reduction */ - asm volatile ("movd %[crc], %%xmm1\n\t" - "movd %[in], %%xmm0\n\t" - "movdqa %[my_p], %%xmm5\n\t" - "pxor %%xmm1, %%xmm0\n\t" - - "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */ - "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */ - "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */ - - 
"pextrd $1, %%xmm0, %[out]\n\t" - : [out] "=m" (*pcrc) - : [in] "m" (*inbuf), - [crc] "m" (*pcrc), - [my_p] "m" (consts->my_p[0]) - ); - } - else - { - asm volatile ("movdqu %[shuf], %%xmm4\n\t" - "movd %[crc], %%xmm1\n\t" - "movdqa %[my_p], %%xmm5\n\t" - "movdqa %[k3k4], %%xmm6\n\t" - : - : [shuf] "m" (crc32_refl_shuf_shift[inlen]), - [crc] "m" (*pcrc), - [my_p] "m" (consts->my_p[0]), - [k3k4] "m" (consts->k[3 - 1]) - ); - - if (inlen >= 8) - { - asm volatile ("movq %[inbuf], %%xmm0\n\t" - : - : [inbuf] "m" (*inbuf) - ); - if (inlen > 8) - { - asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/ - "movq %[inbuf_tail], %%xmm2\n\t" - "punpcklqdq %%xmm2, %%xmm0\n\t" - "pshufb %[merge_shuf], %%xmm0\n\t" - : - : [inbuf_tail] "m" (inbuf[inlen - 8]), - [merge_shuf] "m" - (*crc32_merge9to15_shuf[inlen - 9]) - ); - } - } - else - { - asm volatile ("movd %[inbuf], %%xmm0\n\t" - "pinsrd $1, %[inbuf_tail], %%xmm0\n\t" - "pshufb %[merge_shuf], %%xmm0\n\t" - : - : [inbuf] "m" (*inbuf), - [inbuf_tail] "m" (inbuf[inlen - 4]), - [merge_shuf] "m" - (*crc32_merge5to7_shuf[inlen - 5]) - ); - } - - /* Final fold. */ - asm volatile ("pxor %%xmm1, %%xmm0\n\t" - "pshufb %%xmm4, %%xmm0\n\t" - - /* reduce 128-bits to 96-bits */ - "movdqa %%xmm0, %%xmm1\n\t" - "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t" - "psrldq $8, %%xmm1\n\t" - "pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */ - - /* reduce 96-bits to 64-bits */ - "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */ - "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */ - "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */ - "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */ - - /* barrett reduction */ - "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */ - "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */ - "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */ - "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */ - "pxor %%xmm1, %%xmm0\n\t" - - /* store CRC */ - "pextrd $2, %%xmm0, %[out]\n\t" - : [out] "=m" (*pcrc) - : [k5] "m" (consts->k[5 - 1]) - ); - } -} + temp= _mm_loadu_si128((const __m128i *) &shf_table[data_len & 15]); + a= _mm_shuffle_epi8(fold, temp); -void -crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen) -{ - const struct crc32_consts_s *consts = &crc32_consts; -#if defined(__x86_64__) && defined(__WIN64__) - char win64tmp[2 * 16]; - - /* XMM6-XMM7 need to be restored after use. */ - asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" - "movdqu %%xmm7, 1*16(%0)\n\t" - : - : "r" (win64tmp) - : "memory"); -#endif + temp= _mm_xor_si128(temp, _mm_load_si128((const __m128i *) mask3)); + b= _mm_shuffle_epi8(fold, temp); + b= _mm_blendv_epi8(b, last16, temp); - if (!inlen) - return; - - if (inlen >= 16) - crc32_reflected_bulk(pcrc, inbuf, inlen, consts); - else - crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts); - -#if defined(__x86_64__) && defined(__WIN64__) - /* Restore used registers. */ - asm volatile("movdqu 0*16(%0), %%xmm6\n\t" - "movdqu 1*16(%0), %%xmm7\n\t" - : - : "r" (win64tmp) - : "memory"); -#endif -} + /* k = rk1 & rk2 */ + temp= _mm_clmulepi64_si128(a, k, 0x01); + fold= _mm_clmulepi64_si128(a, k, 0x10); -#ifdef __GNUC__ -int crc32_pclmul_enabled(void) -{ - int eax, ecx; - /* We assume that the CPUID instruction and its parameter 1 are available. - We do not support any precursors of the Intel 80486. 
*/ - asm("cpuid" : "=a"(eax), "=c"(ecx) : "0"(1) : "ebx", "edx"); - return !(~ecx & (1 << 19 | 1 << 1)); -} -#elif 0 /* defined _MSC_VER */ /* FIXME: implement the pclmul interface */ -#include <intrin.h> -int crc32_pclmul_enabled(void) -{ - /* We assume that the CPUID instruction and its parameter 1 are available. - We do not support any precursors of the Intel 80486. */ - int regs[4]; - __cpuid(regs, 1); - return !(~regs[2] & (1 << 19 | 1 << 1)); -} -#else -int crc32_pclmul_enabled(void) -{ - return 0; + fold= _mm_xor_si128(fold, temp); + fold= _mm_xor_si128(fold, b); + } + + /** + * ------------------------------------------------- + * Reduction 128 -> 32 + * Assumes: \a fold holds 128bit folded data + */ +reduction_128_64: + k= _mm_load_si128((__m128i *) (¶ms->rk5)); + fold= crcr32_reduce_128_to_64(fold, k); + +barret_reduction: + k= _mm_load_si128((__m128i *) (¶ms->rk7)); + n= crcr32_reduce_64_to_32(fold, k); + return n; } -#endif +static const MY_ALIGNED(16) struct crcr_pclmulqdq_ctx ether_crc32_clmul= { + 0xccaa009e, /**< rk1 */ + 0x1751997d0, /**< rk2 */ + 0xccaa009e, /**< rk5 */ + 0x163cd6124, /**< rk6 */ + 0x1f7011640, /**< rk7 */ + 0x1db710641 /**< rk8 */ +}; + +/** + * @brief Calculates Ethernet CRC32 using PCLMULQDQ method. + * + * @param data pointer to data block to calculate CRC for + * @param data_len size of data block + * + * @return New CRC value + */ unsigned int crc32_pclmul(unsigned int crc32, const void *buf, size_t len) { - crc32= ~crc32; - crc32_intel_pclmul(&crc32, buf, len); - return ~crc32; + return ~crcr32_calc_pclmulqdq(buf, (uint32_t)len, ~crc32, ðer_crc32_clmul); } -#endif diff --git a/mysys/crc32/crc32c.cc b/mysys/crc32/crc32c.cc new file mode 100644 index 00000000000..4eaceb8c438 --- /dev/null +++ b/mysys/crc32/crc32c.cc @@ -0,0 +1,1254 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. + +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ + +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <my_global.h> +#include <my_byteorder.h> +static inline uint32_t DecodeFixed32(const char *ptr) +{ + return uint4korr(ptr); +} + +static inline uint64_t DecodeFixed64(const char *ptr) +{ + return uint8korr(ptr); +} + +#include <stdint.h> +#ifdef _MSC_VER +#include <intrin.h> +#endif + +#ifdef HAVE_SSE42 +#include <nmmintrin.h> +#include <wmmintrin.h> +#ifdef __GNUC__ +#include <cpuid.h> +#endif +#endif + + +#ifdef __powerpc64__ +#include "crc32c_ppc.h" + +#if __linux__ +#include <sys/auxv.h> + +#ifndef PPC_FEATURE2_VEC_CRYPTO +#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#endif /* __linux__ */ + +#endif + +namespace mysys_namespace { +namespace crc32c { + +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +#ifdef __powerpc64__ +static int arch_ppc_crc32 = 0; +#endif /* __powerpc64__ */ +#endif + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 
0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 
0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 
0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast<const char*>(p)); +} + +#if defined(HAVE_SSE42) && (SIZEOF_SIZE_T == 8) +static inline uint64_t LE_LOAD64(const uint8_t 
*p) { + return DecodeFixed64(reinterpret_cast<const char*>(p)); +} +#endif + +static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { + uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p)); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 8) & 0xff] ^ + table1_[(c >> 16) & 0xff] ^ + table0_[c >> 24]; + // DO it twice. + c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p)); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 8) & 0xff] ^ + table1_[(c >> 16) & 0xff] ^ + table0_[c >> 24]; +} + +static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) { +#ifndef HAVE_SSE42 + Slow_CRC32(l, p); +#elif (SIZEOF_SIZE_T == 8) + *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); + *p += 8; +#else + *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p)); + *p += 4; + *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p)); + *p += 4; +#endif +} + +template<void (*CRC32)(uint64_t*, uint8_t const**)> +uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { + + const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); + const uint8_t *e = p + size; + uint64_t l = crc ^ 0xffffffffu; + +// Align n to (1 << m) byte boundary +#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1)) + +#define STEP1 do { \ + int c = (l & 0xff) ^ *p++; \ + l = table0_[c] ^ (l >> 8); \ +} while (0) + + + // Point x at first 16-byte aligned byte in string. This might be + // just past the end of the string. + const uintptr_t pval = reinterpret_cast<uintptr_t>(p); + const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4)); + if (x <= e) { + // Process bytes until finished or p is 16-byte aligned + while (p != x) { + STEP1; + } + } + // Process bytes 16 at a time + while ((e-p) >= 16) { + CRC32(&l, &p); + CRC32(&l, &p); + } + // Process bytes 8 at a time + while ((e-p) >= 8) { + CRC32(&l, &p); + } + // Process the last few bytes + while (p != e) { + STEP1; + } +#undef STEP1 +#undef ALIGN + return static_cast<uint32_t>(l ^ 0xffffffffu); +} + +// Detect if ARM64 CRC or not. +#ifndef HAVE_ARMV8_CRC +// Detect if SS42 or not. 
+#ifndef HAVE_POWER8 + +static bool isSSE42() { +#ifndef HAVE_SSE42 + return false; +#elif defined(__GNUC__) + uint32_t reax= 0, rebx= 0, recx= 0, redx= 0; + __cpuid(1, reax, rebx, recx, redx); + return (recx & ((int)1 << 20)) != 0; +#elif defined(_MSC_VER) + int info[4]; + __cpuid(info, 0x00000001); + return (info[2] & ((int)1 << 20)) != 0; +#else + return false; +#endif +} + +#ifdef HAVE_SSE42 +extern "C" int crc32_pclmul_enabled(); +#endif + +static bool isPCLMULQDQ() { +#ifdef HAVE_SSE42 + return crc32_pclmul_enabled(); +#else + return false; +#endif +} + +#endif // HAVE_POWER8 +#endif // HAVE_ARMV8_CRC + +typedef uint32_t (*Function)(uint32_t, const char*, size_t); + +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +uint32_t ExtendPPCImpl(uint32_t crc, const char *buf, size_t size) { + return crc32c_ppc(crc, (const unsigned char *)buf, size); +} + +#if __linux__ +static int arch_ppc_probe(void) { + arch_ppc_crc32 = 0; + +#if defined(__powerpc64__) + if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1; +#endif /* __powerpc64__ */ + + return arch_ppc_crc32; +} +#endif // __linux__ + +static bool isAltiVec() { + if (arch_ppc_probe()) { + return true; + } else { + return false; + } +} +#endif + +#if defined(HAVE_ARMV8_CRC) +extern "C" const char *crc32c_aarch64_available(void); +extern "C" uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len); + +static uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) { + return crc32c_aarch64(crc, (const unsigned char *)buf, (size_t) size); +} +#endif + +extern "C" const char * my_crc32c_implementation() +{ +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) + if (arch_ppc_probe()) + return "Using POWER8 crc32 instructions"; +#elif defined(HAVE_ARMV8_CRC) + const char *ret = crc32c_aarch64_available(); + if (ret) + return ret ; +#elif HAVE_SSE42 + if (isSSE42()) + { + if (SIZEOF_SIZE_T == 8 && isPCLMULQDQ()) + return "Using crc32 + pclmulqdq instructions"; + return "Using SSE4.2 crc32 instructions"; + } +#endif + return "Using generic crc32 instructions"; +} + + +/* + * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ * Ferry Toth + * ftoth@exalondelft.nl + * + * https://github.com/htot/crc32c + * + * Modified by Facebook + * + * Original intel whitepaper: + * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * + * This version is from the folly library, created by Dave Watson <davejwatson@fb.com> + * +*/ +#if defined HAVE_SSE42 && defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8 + + +#define CRCtriplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \ + crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); + +#define CRCduplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); + +#define CRCsinglet(crc, buf, offset) \ + crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); + + +// Numbers taken directly from intel whitepaper. +// clang-format off +static const uint64_t clmul_constants[] = { + 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6, + 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e, + 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da, + 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8, + 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296, + 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2, + 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6, + 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092, + 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0, + 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456, + 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e, + 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a, + 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574, + 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832, + 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124, + 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86, + 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e, + 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a, + 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46, + 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a, + 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a, + 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4, + 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56, + 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2, + 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c, + 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac, + 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64, + 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e, + 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c, + 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28, + 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26, + 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c, + 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c, + 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c, + 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4, + 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844, + 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c, + 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730, + 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c, + 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2, + 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2, + 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e, + 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a, + 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a, + 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a, + 0x00a2a8d7e, 0x05b397730, 
0x168763fa6, 0x0b0cd4768, + 0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4, + 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c, + 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba, + 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312, + 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544, + 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a, + 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e, + 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a, + 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c, + 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a, + 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6, + 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca, + 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888, + 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e, + 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528, + 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a, + 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e, + 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa, +}; + +// Compute the crc32c value for buffer smaller than 8 +static inline void align_to_8( + size_t len, + uint64_t& crc0, // crc so far, updated on return + const unsigned char*& next) { // next data pointer, updated on return + uint32_t crc32bit = static_cast<uint32_t>(crc0); + if (len & 0x04) { + crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next); + next += sizeof(uint32_t); + } + if (len & 0x02) { + crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next); + next += sizeof(uint16_t); + } + if (len & 0x01) { + crc32bit = _mm_crc32_u8(crc32bit, *(next)); + next++; + } + crc0 = crc32bit; +} + +// +// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well +// chosen constant and xor's these with the remaining CRC. +// +static inline uint64_t CombineCRC( + size_t block_size, + uint64_t crc0, + uint64_t crc1, + uint64_t crc2, + const uint64_t* next2) { + const auto multiplier = + *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1); + const auto crc0_xmm = _mm_set_epi64x(0, crc0); + const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00); + const auto crc1_xmm = _mm_set_epi64x(0, crc1); + const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10); + const auto res = _mm_xor_si128(res0, res1); + crc0 = _mm_cvtsi128_si64(res); + crc0 = crc0 ^ *((uint64_t*)next2 - 1); + crc2 = _mm_crc32_u64(crc2, crc0); + return crc2; +} + +// Compute CRC-32C using the Intel hardware instruction. +static inline uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) { + const unsigned char* next = (const unsigned char*)buf; + uint64_t count; + uint64_t crc0, crc1, crc2; + crc0 = crc ^ 0xffffffffu; + + + if (len >= 8) { + // if len > 216 then align and use triplets + if (len > 216) { + { + // Work on the bytes (< 8) before the first 8-byte alignment addr starts + auto align_bytes = (8 - (uintptr_t)next) & 7; + len -= align_bytes; + align_to_8(align_bytes, crc0, next); + } + + // Now work on the remaining blocks + count = len / 24; // number of triplets + len %= 24; // bytes remaining + uint64_t n = count >> 7; // #blocks = first block + full blocks + uint64_t block_size = count & 127; + if (block_size == 0) { + block_size = 128; + } else { + n++; + } + // points to the first byte of the next block + const uint64_t* next0 = (uint64_t*)next + block_size; + const uint64_t* next1 = next0 + block_size; + const uint64_t* next2 = next1 + block_size; + + crc1 = crc2 = 0; + // Use Duff's device, a for() loop inside a switch() + // statement. 
This needs to execute at least once, round len + // down to nearest triplet multiple + switch (block_size) { + case 128: + do { + // jumps here for a full block of len 128 + CRCtriplet(crc, next, -128); + /* fallthrough */ + case 127: + // jumps here or below for the first block smaller + CRCtriplet(crc, next, -127); + /* fallthrough */ + case 126: + CRCtriplet(crc, next, -126); // than 128 + /* fallthrough */ + case 125: + CRCtriplet(crc, next, -125); + /* fallthrough */ + case 124: + CRCtriplet(crc, next, -124); + /* fallthrough */ + case 123: + CRCtriplet(crc, next, -123); + /* fallthrough */ + case 122: + CRCtriplet(crc, next, -122); + /* fallthrough */ + case 121: + CRCtriplet(crc, next, -121); + /* fallthrough */ + case 120: + CRCtriplet(crc, next, -120); + /* fallthrough */ + case 119: + CRCtriplet(crc, next, -119); + /* fallthrough */ + case 118: + CRCtriplet(crc, next, -118); + /* fallthrough */ + case 117: + CRCtriplet(crc, next, -117); + /* fallthrough */ + case 116: + CRCtriplet(crc, next, -116); + /* fallthrough */ + case 115: + CRCtriplet(crc, next, -115); + /* fallthrough */ + case 114: + CRCtriplet(crc, next, -114); + /* fallthrough */ + case 113: + CRCtriplet(crc, next, -113); + /* fallthrough */ + case 112: + CRCtriplet(crc, next, -112); + /* fallthrough */ + case 111: + CRCtriplet(crc, next, -111); + /* fallthrough */ + case 110: + CRCtriplet(crc, next, -110); + /* fallthrough */ + case 109: + CRCtriplet(crc, next, -109); + /* fallthrough */ + case 108: + CRCtriplet(crc, next, -108); + /* fallthrough */ + case 107: + CRCtriplet(crc, next, -107); + /* fallthrough */ + case 106: + CRCtriplet(crc, next, -106); + /* fallthrough */ + case 105: + CRCtriplet(crc, next, -105); + /* fallthrough */ + case 104: + CRCtriplet(crc, next, -104); + /* fallthrough */ + case 103: + CRCtriplet(crc, next, -103); + /* fallthrough */ + case 102: + CRCtriplet(crc, next, -102); + /* fallthrough */ + case 101: + CRCtriplet(crc, next, -101); + /* fallthrough */ + case 100: + CRCtriplet(crc, next, -100); + /* fallthrough */ + case 99: + CRCtriplet(crc, next, -99); + /* fallthrough */ + case 98: + CRCtriplet(crc, next, -98); + /* fallthrough */ + case 97: + CRCtriplet(crc, next, -97); + /* fallthrough */ + case 96: + CRCtriplet(crc, next, -96); + /* fallthrough */ + case 95: + CRCtriplet(crc, next, -95); + /* fallthrough */ + case 94: + CRCtriplet(crc, next, -94); + /* fallthrough */ + case 93: + CRCtriplet(crc, next, -93); + /* fallthrough */ + case 92: + CRCtriplet(crc, next, -92); + /* fallthrough */ + case 91: + CRCtriplet(crc, next, -91); + /* fallthrough */ + case 90: + CRCtriplet(crc, next, -90); + /* fallthrough */ + case 89: + CRCtriplet(crc, next, -89); + /* fallthrough */ + case 88: + CRCtriplet(crc, next, -88); + /* fallthrough */ + case 87: + CRCtriplet(crc, next, -87); + /* fallthrough */ + case 86: + CRCtriplet(crc, next, -86); + /* fallthrough */ + case 85: + CRCtriplet(crc, next, -85); + /* fallthrough */ + case 84: + CRCtriplet(crc, next, -84); + /* fallthrough */ + case 83: + CRCtriplet(crc, next, -83); + /* fallthrough */ + case 82: + CRCtriplet(crc, next, -82); + /* fallthrough */ + case 81: + CRCtriplet(crc, next, -81); + /* fallthrough */ + case 80: + CRCtriplet(crc, next, -80); + /* fallthrough */ + case 79: + CRCtriplet(crc, next, -79); + /* fallthrough */ + case 78: + CRCtriplet(crc, next, -78); + /* fallthrough */ + case 77: + CRCtriplet(crc, next, -77); + /* fallthrough */ + case 76: + CRCtriplet(crc, next, -76); + /* fallthrough */ + case 75: + CRCtriplet(crc, next, 
-75); + /* fallthrough */ + case 74: + CRCtriplet(crc, next, -74); + /* fallthrough */ + case 73: + CRCtriplet(crc, next, -73); + /* fallthrough */ + case 72: + CRCtriplet(crc, next, -72); + /* fallthrough */ + case 71: + CRCtriplet(crc, next, -71); + /* fallthrough */ + case 70: + CRCtriplet(crc, next, -70); + /* fallthrough */ + case 69: + CRCtriplet(crc, next, -69); + /* fallthrough */ + case 68: + CRCtriplet(crc, next, -68); + /* fallthrough */ + case 67: + CRCtriplet(crc, next, -67); + /* fallthrough */ + case 66: + CRCtriplet(crc, next, -66); + /* fallthrough */ + case 65: + CRCtriplet(crc, next, -65); + /* fallthrough */ + case 64: + CRCtriplet(crc, next, -64); + /* fallthrough */ + case 63: + CRCtriplet(crc, next, -63); + /* fallthrough */ + case 62: + CRCtriplet(crc, next, -62); + /* fallthrough */ + case 61: + CRCtriplet(crc, next, -61); + /* fallthrough */ + case 60: + CRCtriplet(crc, next, -60); + /* fallthrough */ + case 59: + CRCtriplet(crc, next, -59); + /* fallthrough */ + case 58: + CRCtriplet(crc, next, -58); + /* fallthrough */ + case 57: + CRCtriplet(crc, next, -57); + /* fallthrough */ + case 56: + CRCtriplet(crc, next, -56); + /* fallthrough */ + case 55: + CRCtriplet(crc, next, -55); + /* fallthrough */ + case 54: + CRCtriplet(crc, next, -54); + /* fallthrough */ + case 53: + CRCtriplet(crc, next, -53); + /* fallthrough */ + case 52: + CRCtriplet(crc, next, -52); + /* fallthrough */ + case 51: + CRCtriplet(crc, next, -51); + /* fallthrough */ + case 50: + CRCtriplet(crc, next, -50); + /* fallthrough */ + case 49: + CRCtriplet(crc, next, -49); + /* fallthrough */ + case 48: + CRCtriplet(crc, next, -48); + /* fallthrough */ + case 47: + CRCtriplet(crc, next, -47); + /* fallthrough */ + case 46: + CRCtriplet(crc, next, -46); + /* fallthrough */ + case 45: + CRCtriplet(crc, next, -45); + /* fallthrough */ + case 44: + CRCtriplet(crc, next, -44); + /* fallthrough */ + case 43: + CRCtriplet(crc, next, -43); + /* fallthrough */ + case 42: + CRCtriplet(crc, next, -42); + /* fallthrough */ + case 41: + CRCtriplet(crc, next, -41); + /* fallthrough */ + case 40: + CRCtriplet(crc, next, -40); + /* fallthrough */ + case 39: + CRCtriplet(crc, next, -39); + /* fallthrough */ + case 38: + CRCtriplet(crc, next, -38); + /* fallthrough */ + case 37: + CRCtriplet(crc, next, -37); + /* fallthrough */ + case 36: + CRCtriplet(crc, next, -36); + /* fallthrough */ + case 35: + CRCtriplet(crc, next, -35); + /* fallthrough */ + case 34: + CRCtriplet(crc, next, -34); + /* fallthrough */ + case 33: + CRCtriplet(crc, next, -33); + /* fallthrough */ + case 32: + CRCtriplet(crc, next, -32); + /* fallthrough */ + case 31: + CRCtriplet(crc, next, -31); + /* fallthrough */ + case 30: + CRCtriplet(crc, next, -30); + /* fallthrough */ + case 29: + CRCtriplet(crc, next, -29); + /* fallthrough */ + case 28: + CRCtriplet(crc, next, -28); + /* fallthrough */ + case 27: + CRCtriplet(crc, next, -27); + /* fallthrough */ + case 26: + CRCtriplet(crc, next, -26); + /* fallthrough */ + case 25: + CRCtriplet(crc, next, -25); + /* fallthrough */ + case 24: + CRCtriplet(crc, next, -24); + /* fallthrough */ + case 23: + CRCtriplet(crc, next, -23); + /* fallthrough */ + case 22: + CRCtriplet(crc, next, -22); + /* fallthrough */ + case 21: + CRCtriplet(crc, next, -21); + /* fallthrough */ + case 20: + CRCtriplet(crc, next, -20); + /* fallthrough */ + case 19: + CRCtriplet(crc, next, -19); + /* fallthrough */ + case 18: + CRCtriplet(crc, next, -18); + /* fallthrough */ + case 17: + CRCtriplet(crc, next, -17); + /* 
fallthrough */ + case 16: + CRCtriplet(crc, next, -16); + /* fallthrough */ + case 15: + CRCtriplet(crc, next, -15); + /* fallthrough */ + case 14: + CRCtriplet(crc, next, -14); + /* fallthrough */ + case 13: + CRCtriplet(crc, next, -13); + /* fallthrough */ + case 12: + CRCtriplet(crc, next, -12); + /* fallthrough */ + case 11: + CRCtriplet(crc, next, -11); + /* fallthrough */ + case 10: + CRCtriplet(crc, next, -10); + /* fallthrough */ + case 9: + CRCtriplet(crc, next, -9); + /* fallthrough */ + case 8: + CRCtriplet(crc, next, -8); + /* fallthrough */ + case 7: + CRCtriplet(crc, next, -7); + /* fallthrough */ + case 6: + CRCtriplet(crc, next, -6); + /* fallthrough */ + case 5: + CRCtriplet(crc, next, -5); + /* fallthrough */ + case 4: + CRCtriplet(crc, next, -4); + /* fallthrough */ + case 3: + CRCtriplet(crc, next, -3); + /* fallthrough */ + case 2: + CRCtriplet(crc, next, -2); + /* fallthrough */ + case 1: + CRCduplet(crc, next, -1); // the final triplet is actually only 2 + //{ CombineCRC(); } + crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2); + if (--n > 0) { + crc1 = crc2 = 0; + block_size = 128; + // points to the first byte of the next block + next0 = next2 + 128; + next1 = next0 + 128; // from here on all blocks are 128 long + next2 = next1 + 128; + } + /* fallthrough */ + case 0:; + } while (n > 0); + } + next = (const unsigned char*)next2; + } + uint64_t count2 = len >> 3; // 216 of less bytes is 27 or less singlets + len = len & 7; + next += (count2 * 8); + switch (count2) { + case 27: + CRCsinglet(crc0, next, -27 * 8); + /* fallthrough */ + case 26: + CRCsinglet(crc0, next, -26 * 8); + /* fallthrough */ + case 25: + CRCsinglet(crc0, next, -25 * 8); + /* fallthrough */ + case 24: + CRCsinglet(crc0, next, -24 * 8); + /* fallthrough */ + case 23: + CRCsinglet(crc0, next, -23 * 8); + /* fallthrough */ + case 22: + CRCsinglet(crc0, next, -22 * 8); + /* fallthrough */ + case 21: + CRCsinglet(crc0, next, -21 * 8); + /* fallthrough */ + case 20: + CRCsinglet(crc0, next, -20 * 8); + /* fallthrough */ + case 19: + CRCsinglet(crc0, next, -19 * 8); + /* fallthrough */ + case 18: + CRCsinglet(crc0, next, -18 * 8); + /* fallthrough */ + case 17: + CRCsinglet(crc0, next, -17 * 8); + /* fallthrough */ + case 16: + CRCsinglet(crc0, next, -16 * 8); + /* fallthrough */ + case 15: + CRCsinglet(crc0, next, -15 * 8); + /* fallthrough */ + case 14: + CRCsinglet(crc0, next, -14 * 8); + /* fallthrough */ + case 13: + CRCsinglet(crc0, next, -13 * 8); + /* fallthrough */ + case 12: + CRCsinglet(crc0, next, -12 * 8); + /* fallthrough */ + case 11: + CRCsinglet(crc0, next, -11 * 8); + /* fallthrough */ + case 10: + CRCsinglet(crc0, next, -10 * 8); + /* fallthrough */ + case 9: + CRCsinglet(crc0, next, -9 * 8); + /* fallthrough */ + case 8: + CRCsinglet(crc0, next, -8 * 8); + /* fallthrough */ + case 7: + CRCsinglet(crc0, next, -7 * 8); + /* fallthrough */ + case 6: + CRCsinglet(crc0, next, -6 * 8); + /* fallthrough */ + case 5: + CRCsinglet(crc0, next, -5 * 8); + /* fallthrough */ + case 4: + CRCsinglet(crc0, next, -4 * 8); + /* fallthrough */ + case 3: + CRCsinglet(crc0, next, -3 * 8); + /* fallthrough */ + case 2: + CRCsinglet(crc0, next, -2 * 8); + /* fallthrough */ + case 1: + CRCsinglet(crc0, next, -1 * 8); + /* fallthrough */ + case 0:; + } + } + { + align_to_8(len, crc0, next); + return (uint32_t)crc0 ^ 0xffffffffu; + } +} + +#else +#define NO_THREEWAY_CRC32C +#endif //HAVE_SSE42 && HAVE_PCLMUL + +static inline Function Choose_Extend() { +#ifdef HAVE_POWER8 + return isAltiVec() ? 
ExtendPPCImpl : ExtendImpl<Slow_CRC32>; +#elif defined(HAVE_ARMV8_CRC) + if(crc32c_aarch64_available()) { + return ExtendARMImpl; + } else { + return ExtendImpl<Slow_CRC32>; + } +#else + if (isSSE42()) { + if (isPCLMULQDQ()) { +#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C + return crc32c_3way; +#else + return ExtendImpl<Fast_CRC32>; // Fast_CRC32 will check HAVE_SSE42 itself +#endif + } + else { // no runtime PCLMULQDQ support but has SSE42 support + return ExtendImpl<Fast_CRC32>; + } + } // end of isSSE42() + else { + return ExtendImpl<Slow_CRC32>; + } +#endif +} + +static const Function ChosenExtend = Choose_Extend(); + +static inline uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + return ChosenExtend(crc, buf, size); +} +} // namespace crc32c +} // namespace mysys_namespace + +extern "C" unsigned int my_crc32c(unsigned int crc, const char *buf, size_t size) +{ + return mysys_namespace::crc32c::Extend(crc,buf, size); +} diff --git a/mysys/crc32/crc32c_ppc.c b/mysys/crc32/crc32c_ppc.c new file mode 100644 index 00000000000..72f24283454 --- /dev/null +++ b/mysys/crc32/crc32c_ppc.c @@ -0,0 +1,5 @@ +#define CRC32_FUNCTION crc32c_ppc +#define CRC_TABLE +#define POWER8_INTRINSICS +#include "pcc_crc32c_constants.h" +#include "crc_ppc64.h" diff --git a/mysys/crc32/crc32c_ppc.h b/mysys/crc32/crc32c_ppc.h new file mode 100644 index 00000000000..c359061c610 --- /dev/null +++ b/mysys/crc32/crc32c_ppc.h @@ -0,0 +1,19 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, + unsigned len); + +#ifdef __cplusplus +} +#endif diff --git a/mysys/crc32/crc_ppc64.h b/mysys/crc32/crc_ppc64.h new file mode 100644 index 00000000000..eb9379abc6c --- /dev/null +++ b/mysys/crc32/crc_ppc64.h @@ -0,0 +1,664 @@ +/* + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. + * + * http://en.wikipedia.org/wiki/Barrett_reduction + * + * This code uses gcc vector builtins instead using assembly directly. 
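The header comment above outlines the strategy: fold the input in blocks of up to 32 kB using eight parallel vpmsum streams, reduce the resulting 1024 bits to 64 bits, then finish with a Barrett reduction over GF(2). As a rough scalar model of that final non-reflected Barrett step (illustrative only, not code added by this patch), with clmul64() standing in for the vpmsumd instruction and mu/poly standing in for the two v_Barrett_const entries:

#include <stdint.h>

/* Carry-less (GF(2)) 64x64 -> 128-bit multiply; a software stand-in for
   vpmsumd.  Requires a compiler with unsigned __int128 (gcc/clang). */
static unsigned __int128 clmul64(uint64_t a, uint64_t b)
{
  unsigned __int128 r= 0;
  for (int i= 0; i < 64; i++)
    if (b & (1ULL << i))
      r^= (unsigned __int128) a << i;
  return r;
}

/* Barrett step as described in the comments further down in this file:
   ma = a * mu, q = floor(ma / 2^64), then a - q*n, where subtraction is xor. */
static uint32_t barrett32(uint64_t a, uint64_t mu, uint64_t poly)
{
  uint64_t q= (uint64_t) (clmul64(a, mu) >> 64);
  return (uint32_t) (a ^ (uint64_t) clmul64(q, poly));
}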
+ * + * Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of either: + * + * a) the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) + * any later version, or + * b) the Apache License, Version 2.0 + */ + +#include <altivec.h> + + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#ifdef REFLECT +static unsigned int crc32_align(unsigned int crc, const unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + return crc; +} +#else +static unsigned int crc32_align(unsigned int crc, const unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); + return crc; +} +#endif + +static unsigned int __attribute__ ((aligned (32))) +__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); + + +unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, + unsigned long len) +{ + unsigned int prealign; + unsigned int tail; + +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + if (len < VMX_ALIGN + VMX_ALIGN_MASK) { + crc = crc32_align(crc, p, len); + goto out; + } + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc32_align(crc, p, prealign); + len -= prealign; + p += prealign; + } + + crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc32_align(crc, p, tail); + } + +out: +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + return crc; +} + +#if defined (__clang__) +#include "clang_workaround.h" +#else +#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b)) +#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0) +#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1) +#endif + +/* When we have a load-store in a single-dispatch group and address overlap + * such that foward is not allowed (load-hit-store) the group must be flushed. + * A group ending NOP prevents the flush. + */ +#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory") + +#if defined(__BIG_ENDIAN__) && defined (REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#endif + +#ifdef BYTESWAP_DATA +#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\ + (__vector unsigned char) vc) +#if defined(__LITTLE_ENDIAN__) +/* Byte reverse permute constant LE. 
*/ +static const __vector unsigned long long vperm_const + __attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL, + 0x0001020304050607UL }; +#else +static const __vector unsigned long long vperm_const + __attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL, + 0X0706050403020100UL }; +#endif +#else +#define VEC_PERM(vr, va, vb, vc) +#endif + +static unsigned int __attribute__ ((aligned (32))) +__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { + + const __vector unsigned long long vzero = {0,0}; + const __vector unsigned long long vones = {0xffffffffffffffffUL, + 0xffffffffffffffffUL}; + +#ifdef REFLECT + __vector unsigned char vsht_splat; + const __vector unsigned long long vmask_32bit = + (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, + (__vector unsigned char)vones, 4); +#endif + + const __vector unsigned long long vmask_64bit = + (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, + (__vector unsigned char)vones, 8); + + __vector unsigned long long vcrc; + + __vector unsigned long long vconst1, vconst2; + + /* vdata0-vdata7 will contain our data (p). */ + __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, + vdata5, vdata6, vdata7; + + /* v0-v7 will contain our checksums */ + __vector unsigned long long v0 = {0,0}; + __vector unsigned long long v1 = {0,0}; + __vector unsigned long long v2 = {0,0}; + __vector unsigned long long v3 = {0,0}; + __vector unsigned long long v4 = {0,0}; + __vector unsigned long long v5 = {0,0}; + __vector unsigned long long v6 = {0,0}; + __vector unsigned long long v7 = {0,0}; + + + /* Vector auxiliary variables. */ + __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; + + unsigned int result = 0; + unsigned int offset; /* Constant table offset. */ + + unsigned long i; /* Counter. */ + unsigned long chunks; + + unsigned long block_size; + int next_block = 0; + + /* Align by 128 bits. The last 128 bit block will be processed at end. */ + unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; + +#ifdef REFLECT + vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc); +#else + vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL); + + /* Shift into top 32 bits */ + vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc, + (__vector unsigned char)vzero, 4); +#endif + + /* Short version. */ + if (len < 256) { + /* Calculate where in the constant table we need to start. */ + offset = 256 - len; + + vconst1 = vec_ld(offset, vcrc_short_const); + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vconst1, vperm_const); + + /* xor initial value*/ + vdata0 = vec_xor(vdata0, vcrc); + + vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw + ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); + v0 = vec_xor(v0, vdata0); + + for (i = 16; i < len; i += 16) { + vconst1 = vec_ld(offset + i, vcrc_short_const); + vdata0 = vec_ld(i, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vconst1, vperm_const); + vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw + ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); + v0 = vec_xor(v0, vdata0); + } + } else { + + /* Load initial values. 
*/ + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + vdata1 = vec_ld(16, (__vector unsigned long long*) p); + + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + VEC_PERM(vdata1, vdata1, vdata1, vperm_const); + + vdata2 = vec_ld(32, (__vector unsigned long long*) p); + vdata3 = vec_ld(48, (__vector unsigned long long*) p); + + VEC_PERM(vdata2, vdata2, vdata2, vperm_const); + VEC_PERM(vdata3, vdata3, vdata3, vperm_const); + + vdata4 = vec_ld(64, (__vector unsigned long long*) p); + vdata5 = vec_ld(80, (__vector unsigned long long*) p); + + VEC_PERM(vdata4, vdata4, vdata4, vperm_const); + VEC_PERM(vdata5, vdata5, vdata5, vperm_const); + + vdata6 = vec_ld(96, (__vector unsigned long long*) p); + vdata7 = vec_ld(112, (__vector unsigned long long*) p); + + VEC_PERM(vdata6, vdata6, vdata6, vperm_const); + VEC_PERM(vdata7, vdata7, vdata7, vperm_const); + + /* xor in initial value */ + vdata0 = vec_xor(vdata0, vcrc); + + p = (char *)p + 128; + + do { + /* Checksum in blocks of MAX_SIZE. */ + block_size = length; + if (block_size > MAX_SIZE) { + block_size = MAX_SIZE; + } + + length = length - block_size; + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + offset = (MAX_SIZE/8) - (block_size/8); + /* We reduce our final 128 bytes in a separate step */ + chunks = (block_size/128)-1; + + vconst1 = vec_ld(offset, vcrc_const); + + va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, + (__vector unsigned long long)vconst1); + va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, + (__vector unsigned long long)vconst1); + va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, + (__vector unsigned long long)vconst1); + va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, + (__vector unsigned long long)vconst1); + va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, + (__vector unsigned long long)vconst1); + va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, + (__vector unsigned long long)vconst1); + va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, + (__vector unsigned long long)vconst1); + va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, + (__vector unsigned long long)vconst1); + + if (chunks > 1) { + offset += 16; + vconst2 = vec_ld(offset, vcrc_const); + GROUP_ENDING_NOP; + + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + + vdata1 = vec_ld(16, (__vector unsigned long long*) p); + VEC_PERM(vdata1, vdata1, vdata1, vperm_const); + + vdata2 = vec_ld(32, (__vector unsigned long long*) p); + VEC_PERM(vdata2, vdata2, vdata2, vperm_const); + + vdata3 = vec_ld(48, (__vector unsigned long long*) p); + VEC_PERM(vdata3, vdata3, vdata3, vperm_const); + + vdata4 = vec_ld(64, (__vector unsigned long long*) p); + VEC_PERM(vdata4, vdata4, vdata4, vperm_const); + + vdata5 = vec_ld(80, (__vector unsigned long long*) p); + VEC_PERM(vdata5, vdata5, vdata5, vperm_const); + + vdata6 = vec_ld(96, (__vector unsigned long long*) p); + VEC_PERM(vdata6, vdata6, vdata6, vperm_const); + + vdata7 = vec_ld(112, (__vector unsigned long long*) p); + VEC_PERM(vdata7, vdata7, vdata7, vperm_const); + + p = (char *)p + 128; + + /* + * main loop. We modulo schedule it such that it takes three + * iterations to complete - first iteration load, second + * iteration vpmsum, third iteration xor. 
+ */ + for (i = 0; i < chunks-2; i++) { + vconst1 = vec_ld(offset, vcrc_const); + offset += 16; + GROUP_ENDING_NOP; + + v0 = vec_xor(v0, va0); + va0 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata0, (__vector unsigned long long)vconst2); + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + GROUP_ENDING_NOP; + + v1 = vec_xor(v1, va1); + va1 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata1, (__vector unsigned long long)vconst2); + vdata1 = vec_ld(16, (__vector unsigned long long*) p); + VEC_PERM(vdata1, vdata1, vdata1, vperm_const); + GROUP_ENDING_NOP; + + v2 = vec_xor(v2, va2); + va2 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata2, (__vector unsigned long long)vconst2); + vdata2 = vec_ld(32, (__vector unsigned long long*) p); + VEC_PERM(vdata2, vdata2, vdata2, vperm_const); + GROUP_ENDING_NOP; + + v3 = vec_xor(v3, va3); + va3 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata3, (__vector unsigned long long)vconst2); + vdata3 = vec_ld(48, (__vector unsigned long long*) p); + VEC_PERM(vdata3, vdata3, vdata3, vperm_const); + + vconst2 = vec_ld(offset, vcrc_const); + GROUP_ENDING_NOP; + + v4 = vec_xor(v4, va4); + va4 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata4, (__vector unsigned long long)vconst1); + vdata4 = vec_ld(64, (__vector unsigned long long*) p); + VEC_PERM(vdata4, vdata4, vdata4, vperm_const); + GROUP_ENDING_NOP; + + v5 = vec_xor(v5, va5); + va5 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata5, (__vector unsigned long long)vconst1); + vdata5 = vec_ld(80, (__vector unsigned long long*) p); + VEC_PERM(vdata5, vdata5, vdata5, vperm_const); + GROUP_ENDING_NOP; + + v6 = vec_xor(v6, va6); + va6 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata6, (__vector unsigned long long)vconst1); + vdata6 = vec_ld(96, (__vector unsigned long long*) p); + VEC_PERM(vdata6, vdata6, vdata6, vperm_const); + GROUP_ENDING_NOP; + + v7 = vec_xor(v7, va7); + va7 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata7, (__vector unsigned long long)vconst1); + vdata7 = vec_ld(112, (__vector unsigned long long*) p); + VEC_PERM(vdata7, vdata7, vdata7, vperm_const); + + p = (char *)p + 128; + } + + /* First cool down*/ + vconst1 = vec_ld(offset, vcrc_const); + offset += 16; + + v0 = vec_xor(v0, va0); + va0 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata0, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v1 = vec_xor(v1, va1); + va1 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata1, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v2 = vec_xor(v2, va2); + va2 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata2, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v3 = vec_xor(v3, va3); + va3 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata3, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v4 = vec_xor(v4, va4); + va4 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata4, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v5 = vec_xor(v5, va5); + va5 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata5, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v6 = vec_xor(v6, va6); + va6 = __builtin_crypto_vpmsumd ((__vector unsigned long + long)vdata6, (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v7 = vec_xor(v7, va7); + va7 = __builtin_crypto_vpmsumd ((__vector 
unsigned long + long)vdata7, (__vector unsigned long long)vconst1); + }/* else */ + + /* Second cool down. */ + v0 = vec_xor(v0, va0); + v1 = vec_xor(v1, va1); + v2 = vec_xor(v2, va2); + v3 = vec_xor(v3, va3); + v4 = vec_xor(v4, va4); + v5 = vec_xor(v5, va5); + v6 = vec_xor(v6, va6); + v7 = vec_xor(v7, va7); + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. + */ + v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, + (__vector unsigned char)vzero, 4); + v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, + (__vector unsigned char)vzero, 4); + v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, + (__vector unsigned char)vzero, 4); + v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, + (__vector unsigned char)vzero, 4); + v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, + (__vector unsigned char)vzero, 4); + v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, + (__vector unsigned char)vzero, 4); + v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, + (__vector unsigned char)vzero, 4); + v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, + (__vector unsigned char)vzero, 4); +#endif + + /* xor with the last 1024 bits. */ + va0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(va0, va0, va0, vperm_const); + + va1 = vec_ld(16, (__vector unsigned long long*) p); + VEC_PERM(va1, va1, va1, vperm_const); + + va2 = vec_ld(32, (__vector unsigned long long*) p); + VEC_PERM(va2, va2, va2, vperm_const); + + va3 = vec_ld(48, (__vector unsigned long long*) p); + VEC_PERM(va3, va3, va3, vperm_const); + + va4 = vec_ld(64, (__vector unsigned long long*) p); + VEC_PERM(va4, va4, va4, vperm_const); + + va5 = vec_ld(80, (__vector unsigned long long*) p); + VEC_PERM(va5, va5, va5, vperm_const); + + va6 = vec_ld(96, (__vector unsigned long long*) p); + VEC_PERM(va6, va6, va6, vperm_const); + + va7 = vec_ld(112, (__vector unsigned long long*) p); + VEC_PERM(va7, va7, va7, vperm_const); + + p = (char *)p + 128; + + vdata0 = vec_xor(v0, va0); + vdata1 = vec_xor(v1, va1); + vdata2 = vec_xor(v2, va2); + vdata3 = vec_xor(v3, va3); + vdata4 = vec_xor(v4, va4); + vdata5 = vec_xor(v5, va5); + vdata6 = vec_xor(v6, va6); + vdata7 = vec_xor(v7, va7); + + /* Check if we have more blocks to process */ + next_block = 0; + if (length != 0) { + next_block = 1; + + /* zero v0-v7 */ + v0 = vec_xor(v0, v0); + v1 = vec_xor(v1, v1); + v2 = vec_xor(v2, v2); + v3 = vec_xor(v3, v3); + v4 = vec_xor(v4, v4); + v5 = vec_xor(v5, v5); + v6 = vec_xor(v6, v6); + v7 = vec_xor(v7, v7); + } + length = length + 128; + + } while (next_block); + + /* Calculate how many bytes we have left. */ + length = (len & 127); + + /* Calculate where in (short) constant table we need to start. 
*/ + offset = 128 - length; + + v0 = vec_ld(offset, vcrc_short_const); + v1 = vec_ld(offset + 16, vcrc_short_const); + v2 = vec_ld(offset + 32, vcrc_short_const); + v3 = vec_ld(offset + 48, vcrc_short_const); + v4 = vec_ld(offset + 64, vcrc_short_const); + v5 = vec_ld(offset + 80, vcrc_short_const); + v6 = vec_ld(offset + 96, vcrc_short_const); + v7 = vec_ld(offset + 112, vcrc_short_const); + + offset += 128; + + v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata0,(__vector unsigned int)v0); + v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata1,(__vector unsigned int)v1); + v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata2,(__vector unsigned int)v2); + v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata3,(__vector unsigned int)v3); + v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata4,(__vector unsigned int)v4); + v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata5,(__vector unsigned int)v5); + v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata6,(__vector unsigned int)v6); + v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata7,(__vector unsigned int)v7); + + /* Now reduce the tail (0-112 bytes). */ + for (i = 0; i < length; i+=16) { + vdata0 = vec_ld(i,(__vector unsigned long long*)p); + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + va0 = vec_ld(offset + i,vcrc_short_const); + va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( + (__vector unsigned int)vdata0,(__vector unsigned int)va0); + v0 = vec_xor(v0, va0); + } + + /* xor all parallel chunks together. */ + v0 = vec_xor(v0, v1); + v2 = vec_xor(v2, v3); + v4 = vec_xor(v4, v5); + v6 = vec_xor(v6, v7); + + v0 = vec_xor(v0, v2); + v4 = vec_xor(v4, v6); + + v0 = vec_xor(v0, v4); + } + + /* Barrett Reduction */ + vconst1 = vec_ld(0, v_Barrett_const); + vconst2 = vec_ld(16, v_Barrett_const); + + v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, + (__vector unsigned char)v0, 8); + v0 = vec_xor(v1,v0); + +#ifdef REFLECT + /* shift left one bit */ + vsht_splat = vec_splat_u8 (1); + v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0, + vsht_splat); +#endif + + v0 = vec_and(v0, vmask_64bit); + +#ifndef REFLECT + + /* + * Now for the actual algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + + /* ma */ + v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0, + (__vector unsigned long long)vconst1); + /* q = floor(ma/(2^64)) */ + v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero, + (__vector unsigned char)v1, 8); + /* qn */ + v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, + (__vector unsigned long long)vconst2); + /* a - qn, subtraction is xor in GF(2) */ + v0 = vec_xor (v0, v1); + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + result = __builtin_unpack_vector_1 (v0); +#else + + /* + * The reflected version of Barrett reduction. 
Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + + /* bottom 32 bits of a */ + v1 = vec_and(v0, vmask_32bit); + + /* ma */ + v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, + (__vector unsigned long long)vconst1); + + /* bottom 32bits of ma */ + v1 = vec_and(v1, vmask_32bit); + /* qn */ + v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, + (__vector unsigned long long)vconst2); + /* a - qn, subtraction is xor in GF(2) */ + v0 = vec_xor (v0, v1); + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + + /* shift result into top 64 bits of */ + v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, + (__vector unsigned char)vzero, 4); + + result = __builtin_unpack_vector_0 (v0); +#endif + + return result; +} diff --git a/mysys/checksum.c b/mysys/crc32ieee.cc index 948b9be6164..5f8344b4f9d 100644 --- a/mysys/checksum.c +++ b/mysys/crc32ieee.cc @@ -18,41 +18,46 @@ #include <my_sys.h> #include <zlib.h> -#if !defined(HAVE_CRC32_VPMSUM) /* TODO: remove this once zlib adds inherent support for hardware accelerated crc32 for all architectures. */ static unsigned int my_crc32_zlib(unsigned int crc, const void *data, size_t len) { - return (unsigned int) crc32(crc, data, (unsigned int) len); + return (unsigned int) crc32(crc, (const Bytef *)data, (unsigned int) len); } -my_crc32_t my_checksum= my_crc32_zlib; +#ifdef HAVE_PCLMUL +extern "C" int crc32_pclmul_enabled(); +extern "C" unsigned int crc32_pclmul(unsigned int, const void *, size_t); +#elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC) +extern "C" int crc32_aarch64_available(); +extern "C" unsigned int crc32_aarch64(unsigned int, const void *, size_t); #endif -#if __GNUC__ >= 4 && defined(__x86_64__) -extern int crc32_pclmul_enabled(); -extern unsigned int crc32_pclmul(unsigned int, const void *, size_t); +typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t); -/*----------------------------- x86_64 ---------------------------------*/ -void my_checksum_init(void) +static my_crc32_t init_crc32() { + my_crc32_t func= my_crc32_zlib; +#ifdef HAVE_PCLMUL if (crc32_pclmul_enabled()) - my_checksum= crc32_pclmul; -} + func = crc32_pclmul; #elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC) -/*----------------------------- aarch64 --------------------------------*/ + if (crc32_aarch64_available()) + func= crc32_aarch64; +#endif + return func; +} -extern unsigned int crc32_aarch64(unsigned int, const void *, size_t); +static const my_crc32_t my_checksum_func= init_crc32(); -/* Ideally all ARM 64 bit processor should support crc32 but if some model -doesn't support better to find it out through auxillary vector. 
*/
-void my_checksum_init(void)
+#ifndef __powerpc64__
+/* For powerpc, my_checksum is defined elsewhere. */
+extern "C" unsigned int my_checksum(unsigned int crc, const void *data, size_t len)
 {
- if (crc32_aarch64_available())
- my_checksum= crc32_aarch64;
+ return my_checksum_func(crc, data, len);
 }
-#else
-void my_checksum_init(void) {}
 #endif
+
+
diff --git a/mysys/mf_iocache.c b/mysys/mf_iocache.c
index 2e34cef5d19..75ff99b40a5 100644
--- a/mysys/mf_iocache.c
+++ b/mysys/mf_iocache.c
@@ -1,6 +1,6 @@
 /* Copyright (c) 2000, 2011, Oracle and/or its affiliates
- Copyright (c) 2010, 2015, MariaDB
+ Copyright (c) 2010, 2020, MariaDB
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -769,7 +769,8 @@ int _my_b_cache_read(IO_CACHE *info, uchar *Buffer, size_t Count)
 info->read_pos=info->buffer+Count;
 info->read_end=info->buffer+length;
 info->pos_in_file=pos_in_file;
- memcpy(Buffer, info->buffer, Count);
+ if (Count)
+ memcpy(Buffer, info->buffer, Count);
 DBUG_RETURN(0);
 }
@@ -1270,7 +1271,8 @@ static int _my_b_cache_read_r(IO_CACHE *cache, uchar *Buffer, size_t Count)
 DBUG_RETURN(1);
 }
 cnt= (len > Count) ? Count : len;
- memcpy(Buffer, cache->read_pos, cnt);
+ if (cnt)
+ memcpy(Buffer, cache->read_pos, cnt);
 Count -= cnt;
 Buffer+= cnt;
 left_length+= cnt;
diff --git a/mysys/my_alloc.c b/mysys/my_alloc.c
index b4a63e93be3..d7e62726b22 100644
--- a/mysys/my_alloc.c
+++ b/mysys/my_alloc.c
@@ -1,5 +1,6 @@
 /* Copyright (c) 2000, 2010, Oracle and/or its affiliates
+ Copyright (c) 2010, 2020, MariaDB
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -465,7 +466,8 @@ char *strmake_root(MEM_ROOT *root, const char *str, size_t len)
 char *pos;
 if ((pos=alloc_root(root,len+1)))
 {
- memcpy(pos,str,len);
+ if (len)
+ memcpy(pos,str,len);
 pos[len]=0;
 }
 return pos;
diff --git a/mysys/my_init.c b/mysys/my_init.c
index cd9875017f0..2b420da03be 100644
--- a/mysys/my_init.c
+++ b/mysys/my_init.c
@@ -100,9 +100,6 @@ my_bool my_init(void)
 /* Initialize our mutex handling */
 my_mutex_init();
- /* Initialize CPU architecture specific hardware based crc32 optimization */
- my_checksum_init();
-
 if (my_thread_global_init())
 return 1;
diff --git a/mysys/my_rename.c b/mysys/my_rename.c
index 9f0770e8140..7b31e83be20 100644
--- a/mysys/my_rename.c
+++ b/mysys/my_rename.c
@@ -19,8 +19,62 @@
 #include "m_string.h"
 #undef my_rename
- /* On unix rename deletes to file if it exists */
+#ifdef _WIN32
+
+#define RENAME_MAX_RETRIES 50
+
+/*
+ On Windows, badly behaved third-party programs (backup or antivirus tools,
+ or something else) can hold a file open with a sharing mode that is
+ incompatible with renaming, i.e. they do not use FILE_SHARE_DELETE when
+ opening the file.
+
+ The following function retries the rename (up to RENAME_MAX_RETRIES times)
+ if MoveFileEx returns ERROR_SHARING_VIOLATION.
+*/
+static BOOL win_rename_with_retries(const char *from, const char *to)
+{
+#ifndef DBUG_OFF
+ FILE *fp = NULL;
+ DBUG_EXECUTE_IF("rename_sharing_violation",
+ {
+ fp= fopen(from, "r");
+ DBUG_ASSERT(fp);
+ }
+ );
+#endif
+
+ for (int retry= RENAME_MAX_RETRIES; retry--;)
+ {
+ DWORD ret = MoveFileEx(from, to,
+ MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING);
+
+ DBUG_ASSERT(fp == NULL || (ret == FALSE && GetLastError() == ERROR_SHARING_VIOLATION));
+
+ if (!ret && (GetLastError() == ERROR_SHARING_VIOLATION))
+ {
+#ifndef DBUG_OFF
+ /*
+ If the error was injected via DBUG_EXECUTE_IF, close the file that is
+ causing ERROR_SHARING_VIOLATION, so that the retry succeeds.
+ */
+ if (fp)
+ {
+ fclose(fp);
+ fp= NULL;
+ }
+#endif
+
+ Sleep(10);
+ }
+ else
+ return ret;
+ }
+ return FALSE;
+}
+#endif
+
+
+ /* On Unix, rename() deletes the target ("to") file if it exists */
 int my_rename(const char *from, const char *to, myf MyFlags)
 {
 int error = 0;
@@ -28,8 +82,7 @@ int my_rename(const char *from, const char *to, myf MyFlags)
 DBUG_PRINT("my",("from %s to %s MyFlags %lu", from, to, MyFlags));
 #if defined(__WIN__)
- if (!MoveFileEx(from, to, MOVEFILE_COPY_ALLOWED |
- MOVEFILE_REPLACE_EXISTING))
+ if (!win_rename_with_retries(from, to))
 {
 my_osmaperr(GetLastError());
#elif defined(HAVE_RENAME)
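
Note on the crc32_ppc64.c hunk above: its comments describe Barrett reduction, i.e. computing q, the multiple of the CRC polynomial P to subtract, by carry-less multiplying with mu = floor(x^64/P) and dropping the low 64 bits, then forming the remainder as a xor q*P (the two constants loaded from v_Barrett_const play the roles of mu and P). The stand-alone C sketch below models the non-reflected variant of that step with a software carry-less multiply; the names, the test value and the software clmul are illustrative only, while the vector code does the same arithmetic with __builtin_crypto_vpmsumd on precomputed constants.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* carry-less 64x64 -> 128-bit multiply over GF(2) */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
  uint64_t h= 0, l= 0;
  for (int i= 0; i < 64; i++)
    if ((b >> i) & 1)
    {
      l^= a << i;
      if (i)
        h^= a >> (64 - i);
    }
  *hi= h;
  *lo= l;
}

/* mu = floor(x^64 / P), computed by polynomial long division */
static uint64_t barrett_mu(uint64_t p)
{
  uint64_t r= 0, q= 0;
  for (int i= 64; i >= 0; i--)
  {
    r= (r << 1) | (i == 64);        /* dividend x^64: only bit 64 is set */
    if (r & (1ULL << 32))
    {
      r^= p;                        /* cancel the degree-32 term */
      q|= 1ULL << i;
    }
  }
  return q;
}

/* reference: reduce a 64-bit polynomial mod P by bitwise long division */
static uint32_t poly_mod(uint64_t a, uint64_t p)
{
  for (int i= 63; i >= 32; i--)
    if (a & (1ULL << i))
      a^= p << (i - 32);
  return (uint32_t) a;
}

/* Barrett: q is the multiple of P to subtract; subtraction is xor in GF(2) */
static uint32_t barrett_mod(uint64_t a, uint64_t p, uint64_t mu)
{
  uint64_t hi, lo, q;
  clmul64(a, mu, &hi, &lo);         /* a * mu                          */
  q= hi;                            /* q = floor(a * mu / x^64)        */
  clmul64(q, p, &hi, &lo);          /* q * P (degree < 64, fits in lo) */
  return (uint32_t) (a ^ lo);       /* a - q*P = a mod P               */
}

int main(void)
{
  const uint64_t P= 0x104C11DB7ULL;      /* x^32+x^26+...+1 (CRC-32/IEEE) */
  const uint64_t mu= barrett_mu(P);
  const uint64_t a= 0x0123456789ABCDEFULL;
  assert(barrett_mod(a, P, mu) == poly_mod(a, P));
  printf("mu=%#llx  a mod P=%#x\n", (unsigned long long) mu,
         (unsigned) poly_mod(a, P));
  return 0;
}

Because the intermediate product is taken 64 bits "too high" and shifted back down, q rounds down to the nearest multiple of P, which is exactly the rounding argument the in-tree comment makes; the REFLECT branch applies the same idea with bit-reflected constants so the data never has to be reflected.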
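The crc32ieee.cc hunk replaces the removed my_checksum_init() call (see the my_init.c hunk) with a function pointer that init_crc32() resolves once, during static initialisation of the C++ translation unit. The following plain-C sketch only mirrors the shape of that dispatch: hw_crc32_available() is a hypothetical stand-in for crc32_pclmul_enabled() / crc32_aarch64_available(), the bitwise CRC stands in for the zlib fallback, and because C has no dynamic initialisers the pointer is resolved on first call rather than before main().

#include <stddef.h>
#include <stdio.h>

typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t);

/* bitwise CRC-32 (same convention as zlib's crc32) standing in for the fallback */
static unsigned int crc32_fallback(unsigned int crc, const void *data, size_t len)
{
  const unsigned char *p= data;
  crc= ~crc;
  while (len--)
  {
    crc^= *p++;
    for (int i= 0; i < 8; i++)
      crc= (crc >> 1) ^ (0xEDB88320U & (0U - (crc & 1U)));
  }
  return ~crc;
}

/* hypothetical stand-in for the runtime CPU feature probe */
static int hw_crc32_available(void)
{
  return 0;
}

/* mirrors init_crc32(): probe once, pick the accelerated routine or fall back */
static my_crc32_t resolve_crc32(void)
{
  if (hw_crc32_available())
    return crc32_fallback;          /* would be crc32_pclmul / crc32_aarch64 */
  return crc32_fallback;
}

/* in crc32ieee.cc the pointer is a file-scope constant filled in before main();
   this C sketch resolves it lazily instead */
unsigned int my_checksum_sketch(unsigned int crc, const void *data, size_t len)
{
  static my_crc32_t func;
  if (!func)
    func= resolve_crc32();
  return func(crc, data, len);
}

int main(void)
{
  /* standard CRC-32 check value for "123456789" is 0xCBF43926 */
  printf("%08x\n", my_checksum_sketch(0, "123456789", 9));
  return 0;
}

Resolving the pointer before main(), as the real C++ file does, also sidesteps the data race that the lazy, first-use initialisation in this sketch would have in a threaded server.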
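The `if (Count)` / `if (len)` guards added in the mf_iocache.c and my_alloc.c hunks matter because memcpy() requires valid (non-null) pointer arguments even when the length is zero; a zero-length call with a null source is undefined behaviour and is reported by sanitizers. A minimal illustration of the guard pattern (not MariaDB code):

#include <stdio.h>
#include <string.h>

static void copy_maybe_empty(char *dst, const char *src, size_t n)
{
  if (n)                       /* skip the call entirely for empty copies      */
    memcpy(dst, src, n);       /* src/dst only need to be valid when n is > 0  */
}

int main(void)
{
  char buf[8];
  copy_maybe_empty(buf, NULL, 0);    /* fine: the guard avoids the UB case */
  copy_maybe_empty(buf, "abc", 4);
  printf("%s\n", buf);
  return 0;
}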