diff options
author | Marko Mäkelä <marko.makela@mariadb.com> | 2021-04-14 12:32:27 +0300 |
---|---|---|
committer | Marko Mäkelä <marko.makela@mariadb.com> | 2021-04-14 12:32:27 +0300 |
commit | d2e2d32933823623fa3598c8e2b8a5a322e435bb (patch) | |
tree | 4a0094ff26be1e985281ef008433ce1493b58ae7 /mysys | |
parent | 72e0601d11ac40a27ce071cba8626612bc625e3c (diff) | |
parent | 6c3e860cbf36831c118f6ea183acbbeb3c889bed (diff) | |
download | mariadb-git-d2e2d32933823623fa3598c8e2b8a5a322e435bb.tar.gz |
Merge 10.5 into 10.6
Diffstat (limited to 'mysys')
-rw-r--r-- | mysys/CMakeLists.txt | 32 | ||||
-rw-r--r-- | mysys/crc32/crc32_x86.c | 28 | ||||
-rw-r--r-- | mysys/crc32/crc32c.cc | 955 | ||||
-rw-r--r-- | mysys/crc32/crc32c_amd64.cc | 711 | ||||
-rw-r--r-- | mysys/crc32ieee.cc | 20 |
5 files changed, 879 insertions, 867 deletions
diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt index d2d740aba17..5a4eeeba603 100644 --- a/mysys/CMakeLists.txt +++ b/mysys/CMakeLists.txt @@ -16,7 +16,7 @@ INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/mysys) -SET(MYSYS_SOURCES array.c charset-def.c charset.c crc32ieee.cc my_default.c +SET(MYSYS_SOURCES array.c charset-def.c charset.c my_default.c get_password.c errors.c hash.c list.c mf_cache.c mf_dirname.c mf_fn_ext.c @@ -60,19 +60,29 @@ ENDIF() IF(MSVC) SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c) + IF(CMAKE_SIZEOF_VOID_P EQUAL 8) + SET (MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32c_amd64.cc) + ENDIF() ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL) IF(CLANG_CL) - SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.cc crc32/crc32c.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") + SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") ENDIF() ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686") - MY_CHECK_C_COMPILER_FLAG(-msse4.2) - MY_CHECK_C_COMPILER_FLAG(-mpclmul) + MY_CHECK_CXX_COMPILER_FLAG(-msse4.2) + MY_CHECK_CXX_COMPILER_FLAG(-mpclmul) CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H) CHECK_INCLUDE_FILE(x86intrin.h HAVE_X86INTRIN_H) - IF(have_C__msse4.2 AND have_C__mpclmul AND HAVE_CPUID_H AND HAVE_X86INTRIN_H) - SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c) - SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c crc32/crc32c.cc PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") - ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL) + IF(have_CXX__msse4.2 AND HAVE_CPUID_H) + ADD_DEFINITIONS(-DHAVE_SSE42) + IF (have_CXX__mpclmul AND HAVE_X86INTRIN_H) + ADD_DEFINITIONS(-DHAVE_PCLMUL) + SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c) + SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") + IF(CMAKE_SIZEOF_VOID_P EQUAL 8) + SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32c_amd64.cc) + SET_SOURCE_FILES_PROPERTIES(crc32/crc32c_amd64.cc PROPERTIES 
COMPILE_FLAGS "-msse4.2 -mpclmul") + ENDIF() + ENDIF() ENDIF() ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") IF(CMAKE_COMPILER_IS_GNUCC) @@ -129,11 +139,15 @@ ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") COMPILE_FLAGS "-march=armv8-a+crc+crypto") ENDIF() ENDIF() -ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64" OR CMAKE_SYSTEM_NAME MATCHES AIX) +ENDIF() + +IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64" OR CMAKE_SYSTEM_NAME MATCHES AIX) SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_ppc64.c crc32/crc32c_ppc.c) SET_SOURCE_FILES_PROPERTIES(crc32/crc32_ppc64.c crc32/crc32c_ppc.c PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -maltivec -mvsx -mpower8-vector -mcrypto -mpower8-vector") ADD_DEFINITIONS(-DHAVE_POWER8 -DHAS_ALTIVEC) +ELSE() + SET (MYSYS_SOURCES ${MYSYS_SOURCES} crc32ieee.cc) ENDIF() IF(UNIX) diff --git a/mysys/crc32/crc32_x86.c b/mysys/crc32/crc32_x86.c index 1e5d2a0a089..f077399caca 100644 --- a/mysys/crc32/crc32_x86.c +++ b/mysys/crc32/crc32_x86.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 MariaDB +/* Copyright (c) 2020, 2021, MariaDB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -55,38 +55,14 @@ #include <stdint.h> #include <stddef.h> -#if defined(__GNUC__) +#ifdef __GNUC__ #include <x86intrin.h> -#include <cpuid.h> #elif defined(_MSC_VER) #include <intrin.h> #else #error "unknown compiler" #endif -static int has_sse42_and_pclmul(uint32_t recx) -{ - /* 1 << 20 is SSE42, 1 << 1 is PCLMULQDQ */ -#define bits_SSE42_AND_PCLMUL (1 << 20 | 1 << 1) - return (recx & bits_SSE42_AND_PCLMUL) == bits_SSE42_AND_PCLMUL; -} - -#ifdef __GNUC__ -int crc32_pclmul_enabled(void) -{ - uint32_t reax= 0, rebx= 0, recx= 0, redx= 0; - __cpuid(1, reax, rebx, recx, redx); - return has_sse42_and_pclmul(recx); -} -#elif defined(_MSC_VER) -int crc32_pclmul_enabled(void) -{ - int regs[4]; - __cpuid(regs, 1); - return has_sse42_and_pclmul(regs[2]); -} -#endif - /** * 
@brief Shifts left 128 bit register by specified number of bytes * diff --git a/mysys/crc32/crc32c.cc b/mysys/crc32/crc32c.cc index b6c80886ec1..082d467e7da 100644 --- a/mysys/crc32/crc32c.cc +++ b/mysys/crc32/crc32c.cc @@ -32,11 +32,20 @@ static inline uint32_t DecodeFixed32(const char *ptr) #endif #ifdef HAVE_SSE42 -#include <nmmintrin.h> -#include <wmmintrin.h> -#ifdef __GNUC__ -#include <cpuid.h> -#endif +# ifdef __GNUC__ +# include <cpuid.h> +# if __GNUC__ < 5 && !defined __clang__ +/* the headers do not really work in GCC before version 5 */ +# define _mm_crc32_u8(crc,data) __builtin_ia32_crc32qi(crc,data) +# define _mm_crc32_u32(crc,data) __builtin_ia32_crc32si(crc,data) +# define _mm_crc32_u64(crc,data) __builtin_ia32_crc32di(crc,data) +# else +# include <nmmintrin.h> +# endif +# define USE_SSE42 __attribute__((target("sse4.2"))) +# else +# define USE_SSE42 /* nothing */ +# endif #endif @@ -337,19 +346,8 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) { return DecodeFixed32(reinterpret_cast<const char*>(p)); } -#if defined(HAVE_SSE42) && (SIZEOF_SIZE_T == 8) - -static inline uint64_t DecodeFixed64(const char *ptr) +static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { - return uint8korr(ptr); -} - -static inline uint64_t LE_LOAD64(const uint8_t *p) { - return DecodeFixed64(reinterpret_cast<const char*>(p)); -} -#endif - -static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p)); *p += 4; *l = table3_[c & 0xff] ^ @@ -365,27 +363,6 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { table0_[c >> 24]; } -__attribute__((unused)) static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) { -#ifndef HAVE_SSE42 - Slow_CRC32(l, p); -#elif (SIZEOF_SIZE_T == 8) - *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); - *p += 8; -#else - *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p)); - *p += 4; - *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p)); - *p += 4; 
-#endif -} - -template<void (*CRC32)(uint64_t*, uint8_t const**)> -uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { - - const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); - const uint8_t *e = p + size; - uint64_t l = crc ^ 0xffffffffu; - #ifdef ALIGN #undef ALIGN #endif @@ -398,70 +375,115 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { l = table0_[c] ^ (l >> 8); \ } while (0) +static uint32_t crc32c_slow(uint32_t crc, const char* buf, size_t size) +{ + const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); + const uint8_t *e = p + size; + uint64_t l = crc ^ 0xffffffffu; // Point x at first 16-byte aligned byte in string. This might be // just past the end of the string. const uintptr_t pval = reinterpret_cast<uintptr_t>(p); const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4)); - if (x <= e) { + if (x <= e) // Process bytes until finished or p is 16-byte aligned - while (p != x) { + while (p != x) STEP1; - } - } // Process bytes 16 at a time - while ((e-p) >= 16) { - CRC32(&l, &p); - CRC32(&l, &p); + while ((e-p) >= 16) + { + Slow_CRC32(&l, &p); + Slow_CRC32(&l, &p); } // Process bytes 8 at a time - while ((e-p) >= 8) { - CRC32(&l, &p); - } + while ((e-p) >= 8) + Slow_CRC32(&l, &p); // Process the last few bytes - while (p != e) { + while (p != e) STEP1; - } -#undef STEP1 -#undef ALIGN return static_cast<uint32_t>(l ^ 0xffffffffu); } -// Detect if ARM64 CRC or not. -#ifndef HAVE_ARMV8_CRC -// Detect if SS42 or not. 
-#ifndef HAVE_POWER8 +#if defined HAVE_POWER8 +#elif defined HAVE_ARMV8_CRC +#elif defined HAVE_SSE42 +constexpr uint32_t cpuid_ecx_SSE42= 1U << 20; +constexpr uint32_t cpuid_ecx_SSE42_AND_PCLMUL= cpuid_ecx_SSE42 | 1U<<1; -static bool isSSE42() { -#ifndef HAVE_SSE42 - return false; -#elif defined(__GNUC__) +static uint32_t cpuid_ecx() +{ +#ifdef __GNUC__ uint32_t reax= 0, rebx= 0, recx= 0, redx= 0; __cpuid(1, reax, rebx, recx, redx); - return (recx & ((int)1 << 20)) != 0; -#elif defined(_MSC_VER) - int info[4]; - __cpuid(info, 0x00000001); - return (info[2] & ((int)1 << 20)) != 0; + return recx; +#elif defined _MSC_VER + int regs[4]; + __cpuid(regs, 1); + return regs[2]; #else - return false; +# error "unknown compiler" #endif } -#ifdef HAVE_SSE42 -extern "C" int crc32_pclmul_enabled(); -#endif +extern "C" int crc32_pclmul_enabled(void) +{ + return !(~cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL); +} -static bool isPCLMULQDQ() { -#ifdef HAVE_SSE42 - return crc32_pclmul_enabled(); -#else - return false; +#if SIZEOF_SIZE_T == 8 +extern "C" uint32_t crc32c_3way(uint32_t crc, const char *buf, size_t len); + +USE_SSE42 +static inline uint64_t LE_LOAD64(const uint8_t *ptr) +{ + return uint8korr(reinterpret_cast<const char*>(ptr)); +} #endif + +USE_SSE42 +static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) +{ +# if (SIZEOF_SIZE_T == 8) + *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); + *p += 8; +# else + *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p)); + *p += 4; + *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p)); + *p += 4; +# endif } -#endif // HAVE_POWER8 -#endif // HAVE_ARMV8_CRC +USE_SSE42 +static uint32_t crc32c_sse42(uint32_t crc, const char* buf, size_t size) +{ + const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); + const uint8_t *e = p + size; + uint64_t l = crc ^ 0xffffffffu; + + // Point x at first 16-byte aligned byte in string. This might be + // just past the end of the string. 
+ const uintptr_t pval = reinterpret_cast<uintptr_t>(p); + const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4)); + if (x <= e) + // Process bytes until finished or p is 16-byte aligned + while (p != x) + STEP1; + // Process bytes 16 at a time + while ((e-p) >= 16) + { + Fast_CRC32(&l, &p); + Fast_CRC32(&l, &p); + } + // Process bytes 8 at a time + while ((e-p) >= 8) + Fast_CRC32(&l, &p); + // Process the last few bytes + while (p != e) + STEP1; + return static_cast<uint32_t>(l ^ 0xffffffffu); +} +#endif typedef uint32_t (*Function)(uint32_t, const char*, size_t); @@ -507,14 +529,6 @@ static int arch_ppc_probe(void) { return arch_ppc_crc32; } #endif // __linux__ - -static bool isAltiVec() { - if (arch_ppc_probe()) { - return true; - } else { - return false; - } -} #endif #if defined(HAVE_ARMV8_CRC) @@ -526,760 +540,59 @@ static uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) { } #endif -extern "C" const char * my_crc32c_implementation() +static inline Function Choose_Extend() { -#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +#if defined HAVE_POWER8 && defined HAS_ALTIVEC if (arch_ppc_probe()) - return "Using POWER8 crc32 instructions"; + return ExtendPPCImpl; #elif defined(HAVE_ARMV8_CRC) - const char *ret = crc32c_aarch64_available(); - if (ret) - return ret ; + if (crc32c_aarch64_available()) + return ExtendARMImpl; #elif HAVE_SSE42 - if (isSSE42()) - { - if (SIZEOF_SIZE_T == 8 && isPCLMULQDQ()) - return "Using crc32 + pclmulqdq instructions"; - return "Using SSE4.2 crc32 instructions"; +# if defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8 + switch (cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL) { + case cpuid_ecx_SSE42_AND_PCLMUL: + return crc32c_3way; + case cpuid_ecx_SSE42: + return crc32c_sse42; } +# else + if (cpuid_ecx() & cpuid_ecx_SSE42) + return crc32c_sse42; +# endif #endif - return "Using generic crc32 instructions"; -} - - -/* - * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands - * This software is provided 
'as-is', without any express or implied - * warranty. In no event will the author be held liable for any damages - * arising from the use of this software. - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - * Ferry Toth - * ftoth@exalondelft.nl - * - * https://github.com/htot/crc32c - * - * Modified by Facebook - * - * Original intel whitepaper: - * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" - * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf - * - * This version is from the folly library, created by Dave Watson <davejwatson@fb.com> - * -*/ -#if defined HAVE_SSE42 && defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8 - - -#define CRCtriplet(crc, buf, offset) \ - crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ - crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \ - crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); - -#define CRCduplet(crc, buf, offset) \ - crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ - crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); - -#define CRCsinglet(crc, buf, offset) \ - crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); - - -// Numbers taken directly from intel whitepaper. 
-// clang-format off -static const uint64_t clmul_constants[] = { - 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6, - 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e, - 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da, - 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8, - 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296, - 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2, - 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6, - 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092, - 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0, - 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456, - 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e, - 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a, - 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574, - 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832, - 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124, - 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86, - 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e, - 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a, - 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46, - 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a, - 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a, - 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4, - 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56, - 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2, - 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c, - 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac, - 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64, - 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e, - 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c, - 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28, - 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26, - 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c, - 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c, - 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c, - 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4, - 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 
0x0271d9844, - 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c, - 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730, - 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c, - 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2, - 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2, - 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e, - 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a, - 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a, - 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a, - 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768, - 0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4, - 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c, - 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba, - 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312, - 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544, - 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a, - 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e, - 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a, - 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c, - 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a, - 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6, - 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca, - 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888, - 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e, - 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528, - 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a, - 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e, - 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa, -}; - -// Compute the crc32c value for buffer smaller than 8 -static inline void align_to_8( - size_t len, - uint64_t& crc0, // crc so far, updated on return - const unsigned char*& next) { // next data pointer, updated on return - uint32_t crc32bit = static_cast<uint32_t>(crc0); - if (len & 0x04) { - crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next); - next += sizeof(uint32_t); - } - if (len & 0x02) { - crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next); - next += 
sizeof(uint16_t); - } - if (len & 0x01) { - crc32bit = _mm_crc32_u8(crc32bit, *(next)); - next++; - } - crc0 = crc32bit; -} - -// -// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well -// chosen constant and xor's these with the remaining CRC. -// -static inline uint64_t CombineCRC( - size_t block_size, - uint64_t crc0, - uint64_t crc1, - uint64_t crc2, - const uint64_t* next2) { - const auto multiplier = - *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1); - const auto crc0_xmm = _mm_set_epi64x(0, crc0); - const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00); - const auto crc1_xmm = _mm_set_epi64x(0, crc1); - const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10); - const auto res = _mm_xor_si128(res0, res1); - crc0 = _mm_cvtsi128_si64(res); - crc0 = crc0 ^ *((uint64_t*)next2 - 1); - crc2 = _mm_crc32_u64(crc2, crc0); - return crc2; + return crc32c_slow; } -// Compute CRC-32C using the Intel hardware instruction. -static inline uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) { - const unsigned char* next = (const unsigned char*)buf; - uint64_t count; - uint64_t crc0, crc1, crc2; - crc0 = crc ^ 0xffffffffu; - - - if (len >= 8) { - // if len > 216 then align and use triplets - if (len > 216) { - { - // Work on the bytes (< 8) before the first 8-byte alignment addr starts - auto align_bytes = (8 - (uintptr_t)next) & 7; - len -= align_bytes; - align_to_8(align_bytes, crc0, next); - } - - // Now work on the remaining blocks - count = len / 24; // number of triplets - len %= 24; // bytes remaining - uint64_t n = count >> 7; // #blocks = first block + full blocks - uint64_t block_size = count & 127; - if (block_size == 0) { - block_size = 128; - } else { - n++; - } - // points to the first byte of the next block - const uint64_t* next0 = (uint64_t*)next + block_size; - const uint64_t* next1 = next0 + block_size; - const uint64_t* next2 = next1 + block_size; +static const Function 
ChosenExtend= Choose_Extend(); - crc1 = crc2 = 0; - // Use Duff's device, a for() loop inside a switch() - // statement. This needs to execute at least once, round len - // down to nearest triplet multiple - switch (block_size) { - case 128: - do { - // jumps here for a full block of len 128 - CRCtriplet(crc, next, -128); - /* fallthrough */ - case 127: - // jumps here or below for the first block smaller - CRCtriplet(crc, next, -127); - /* fallthrough */ - case 126: - CRCtriplet(crc, next, -126); // than 128 - /* fallthrough */ - case 125: - CRCtriplet(crc, next, -125); - /* fallthrough */ - case 124: - CRCtriplet(crc, next, -124); - /* fallthrough */ - case 123: - CRCtriplet(crc, next, -123); - /* fallthrough */ - case 122: - CRCtriplet(crc, next, -122); - /* fallthrough */ - case 121: - CRCtriplet(crc, next, -121); - /* fallthrough */ - case 120: - CRCtriplet(crc, next, -120); - /* fallthrough */ - case 119: - CRCtriplet(crc, next, -119); - /* fallthrough */ - case 118: - CRCtriplet(crc, next, -118); - /* fallthrough */ - case 117: - CRCtriplet(crc, next, -117); - /* fallthrough */ - case 116: - CRCtriplet(crc, next, -116); - /* fallthrough */ - case 115: - CRCtriplet(crc, next, -115); - /* fallthrough */ - case 114: - CRCtriplet(crc, next, -114); - /* fallthrough */ - case 113: - CRCtriplet(crc, next, -113); - /* fallthrough */ - case 112: - CRCtriplet(crc, next, -112); - /* fallthrough */ - case 111: - CRCtriplet(crc, next, -111); - /* fallthrough */ - case 110: - CRCtriplet(crc, next, -110); - /* fallthrough */ - case 109: - CRCtriplet(crc, next, -109); - /* fallthrough */ - case 108: - CRCtriplet(crc, next, -108); - /* fallthrough */ - case 107: - CRCtriplet(crc, next, -107); - /* fallthrough */ - case 106: - CRCtriplet(crc, next, -106); - /* fallthrough */ - case 105: - CRCtriplet(crc, next, -105); - /* fallthrough */ - case 104: - CRCtriplet(crc, next, -104); - /* fallthrough */ - case 103: - CRCtriplet(crc, next, -103); - /* fallthrough */ - case 102: - 
CRCtriplet(crc, next, -102); - /* fallthrough */ - case 101: - CRCtriplet(crc, next, -101); - /* fallthrough */ - case 100: - CRCtriplet(crc, next, -100); - /* fallthrough */ - case 99: - CRCtriplet(crc, next, -99); - /* fallthrough */ - case 98: - CRCtriplet(crc, next, -98); - /* fallthrough */ - case 97: - CRCtriplet(crc, next, -97); - /* fallthrough */ - case 96: - CRCtriplet(crc, next, -96); - /* fallthrough */ - case 95: - CRCtriplet(crc, next, -95); - /* fallthrough */ - case 94: - CRCtriplet(crc, next, -94); - /* fallthrough */ - case 93: - CRCtriplet(crc, next, -93); - /* fallthrough */ - case 92: - CRCtriplet(crc, next, -92); - /* fallthrough */ - case 91: - CRCtriplet(crc, next, -91); - /* fallthrough */ - case 90: - CRCtriplet(crc, next, -90); - /* fallthrough */ - case 89: - CRCtriplet(crc, next, -89); - /* fallthrough */ - case 88: - CRCtriplet(crc, next, -88); - /* fallthrough */ - case 87: - CRCtriplet(crc, next, -87); - /* fallthrough */ - case 86: - CRCtriplet(crc, next, -86); - /* fallthrough */ - case 85: - CRCtriplet(crc, next, -85); - /* fallthrough */ - case 84: - CRCtriplet(crc, next, -84); - /* fallthrough */ - case 83: - CRCtriplet(crc, next, -83); - /* fallthrough */ - case 82: - CRCtriplet(crc, next, -82); - /* fallthrough */ - case 81: - CRCtriplet(crc, next, -81); - /* fallthrough */ - case 80: - CRCtriplet(crc, next, -80); - /* fallthrough */ - case 79: - CRCtriplet(crc, next, -79); - /* fallthrough */ - case 78: - CRCtriplet(crc, next, -78); - /* fallthrough */ - case 77: - CRCtriplet(crc, next, -77); - /* fallthrough */ - case 76: - CRCtriplet(crc, next, -76); - /* fallthrough */ - case 75: - CRCtriplet(crc, next, -75); - /* fallthrough */ - case 74: - CRCtriplet(crc, next, -74); - /* fallthrough */ - case 73: - CRCtriplet(crc, next, -73); - /* fallthrough */ - case 72: - CRCtriplet(crc, next, -72); - /* fallthrough */ - case 71: - CRCtriplet(crc, next, -71); - /* fallthrough */ - case 70: - CRCtriplet(crc, next, -70); - /* 
fallthrough */ - case 69: - CRCtriplet(crc, next, -69); - /* fallthrough */ - case 68: - CRCtriplet(crc, next, -68); - /* fallthrough */ - case 67: - CRCtriplet(crc, next, -67); - /* fallthrough */ - case 66: - CRCtriplet(crc, next, -66); - /* fallthrough */ - case 65: - CRCtriplet(crc, next, -65); - /* fallthrough */ - case 64: - CRCtriplet(crc, next, -64); - /* fallthrough */ - case 63: - CRCtriplet(crc, next, -63); - /* fallthrough */ - case 62: - CRCtriplet(crc, next, -62); - /* fallthrough */ - case 61: - CRCtriplet(crc, next, -61); - /* fallthrough */ - case 60: - CRCtriplet(crc, next, -60); - /* fallthrough */ - case 59: - CRCtriplet(crc, next, -59); - /* fallthrough */ - case 58: - CRCtriplet(crc, next, -58); - /* fallthrough */ - case 57: - CRCtriplet(crc, next, -57); - /* fallthrough */ - case 56: - CRCtriplet(crc, next, -56); - /* fallthrough */ - case 55: - CRCtriplet(crc, next, -55); - /* fallthrough */ - case 54: - CRCtriplet(crc, next, -54); - /* fallthrough */ - case 53: - CRCtriplet(crc, next, -53); - /* fallthrough */ - case 52: - CRCtriplet(crc, next, -52); - /* fallthrough */ - case 51: - CRCtriplet(crc, next, -51); - /* fallthrough */ - case 50: - CRCtriplet(crc, next, -50); - /* fallthrough */ - case 49: - CRCtriplet(crc, next, -49); - /* fallthrough */ - case 48: - CRCtriplet(crc, next, -48); - /* fallthrough */ - case 47: - CRCtriplet(crc, next, -47); - /* fallthrough */ - case 46: - CRCtriplet(crc, next, -46); - /* fallthrough */ - case 45: - CRCtriplet(crc, next, -45); - /* fallthrough */ - case 44: - CRCtriplet(crc, next, -44); - /* fallthrough */ - case 43: - CRCtriplet(crc, next, -43); - /* fallthrough */ - case 42: - CRCtriplet(crc, next, -42); - /* fallthrough */ - case 41: - CRCtriplet(crc, next, -41); - /* fallthrough */ - case 40: - CRCtriplet(crc, next, -40); - /* fallthrough */ - case 39: - CRCtriplet(crc, next, -39); - /* fallthrough */ - case 38: - CRCtriplet(crc, next, -38); - /* fallthrough */ - case 37: - CRCtriplet(crc, 
next, -37); - /* fallthrough */ - case 36: - CRCtriplet(crc, next, -36); - /* fallthrough */ - case 35: - CRCtriplet(crc, next, -35); - /* fallthrough */ - case 34: - CRCtriplet(crc, next, -34); - /* fallthrough */ - case 33: - CRCtriplet(crc, next, -33); - /* fallthrough */ - case 32: - CRCtriplet(crc, next, -32); - /* fallthrough */ - case 31: - CRCtriplet(crc, next, -31); - /* fallthrough */ - case 30: - CRCtriplet(crc, next, -30); - /* fallthrough */ - case 29: - CRCtriplet(crc, next, -29); - /* fallthrough */ - case 28: - CRCtriplet(crc, next, -28); - /* fallthrough */ - case 27: - CRCtriplet(crc, next, -27); - /* fallthrough */ - case 26: - CRCtriplet(crc, next, -26); - /* fallthrough */ - case 25: - CRCtriplet(crc, next, -25); - /* fallthrough */ - case 24: - CRCtriplet(crc, next, -24); - /* fallthrough */ - case 23: - CRCtriplet(crc, next, -23); - /* fallthrough */ - case 22: - CRCtriplet(crc, next, -22); - /* fallthrough */ - case 21: - CRCtriplet(crc, next, -21); - /* fallthrough */ - case 20: - CRCtriplet(crc, next, -20); - /* fallthrough */ - case 19: - CRCtriplet(crc, next, -19); - /* fallthrough */ - case 18: - CRCtriplet(crc, next, -18); - /* fallthrough */ - case 17: - CRCtriplet(crc, next, -17); - /* fallthrough */ - case 16: - CRCtriplet(crc, next, -16); - /* fallthrough */ - case 15: - CRCtriplet(crc, next, -15); - /* fallthrough */ - case 14: - CRCtriplet(crc, next, -14); - /* fallthrough */ - case 13: - CRCtriplet(crc, next, -13); - /* fallthrough */ - case 12: - CRCtriplet(crc, next, -12); - /* fallthrough */ - case 11: - CRCtriplet(crc, next, -11); - /* fallthrough */ - case 10: - CRCtriplet(crc, next, -10); - /* fallthrough */ - case 9: - CRCtriplet(crc, next, -9); - /* fallthrough */ - case 8: - CRCtriplet(crc, next, -8); - /* fallthrough */ - case 7: - CRCtriplet(crc, next, -7); - /* fallthrough */ - case 6: - CRCtriplet(crc, next, -6); - /* fallthrough */ - case 5: - CRCtriplet(crc, next, -5); - /* fallthrough */ - case 4: - 
CRCtriplet(crc, next, -4); - /* fallthrough */ - case 3: - CRCtriplet(crc, next, -3); - /* fallthrough */ - case 2: - CRCtriplet(crc, next, -2); - /* fallthrough */ - case 1: - CRCduplet(crc, next, -1); // the final triplet is actually only 2 - //{ CombineCRC(); } - crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2); - if (--n > 0) { - crc1 = crc2 = 0; - block_size = 128; - // points to the first byte of the next block - next0 = next2 + 128; - next1 = next0 + 128; // from here on all blocks are 128 long - next2 = next1 + 128; - } - /* fallthrough */ - case 0:; - } while (n > 0); - } - next = (const unsigned char*)next2; - } - uint64_t count2 = len >> 3; // 216 of less bytes is 27 or less singlets - len = len & 7; - next += (count2 * 8); - switch (count2) { - case 27: - CRCsinglet(crc0, next, -27 * 8); - /* fallthrough */ - case 26: - CRCsinglet(crc0, next, -26 * 8); - /* fallthrough */ - case 25: - CRCsinglet(crc0, next, -25 * 8); - /* fallthrough */ - case 24: - CRCsinglet(crc0, next, -24 * 8); - /* fallthrough */ - case 23: - CRCsinglet(crc0, next, -23 * 8); - /* fallthrough */ - case 22: - CRCsinglet(crc0, next, -22 * 8); - /* fallthrough */ - case 21: - CRCsinglet(crc0, next, -21 * 8); - /* fallthrough */ - case 20: - CRCsinglet(crc0, next, -20 * 8); - /* fallthrough */ - case 19: - CRCsinglet(crc0, next, -19 * 8); - /* fallthrough */ - case 18: - CRCsinglet(crc0, next, -18 * 8); - /* fallthrough */ - case 17: - CRCsinglet(crc0, next, -17 * 8); - /* fallthrough */ - case 16: - CRCsinglet(crc0, next, -16 * 8); - /* fallthrough */ - case 15: - CRCsinglet(crc0, next, -15 * 8); - /* fallthrough */ - case 14: - CRCsinglet(crc0, next, -14 * 8); - /* fallthrough */ - case 13: - CRCsinglet(crc0, next, -13 * 8); - /* fallthrough */ - case 12: - CRCsinglet(crc0, next, -12 * 8); - /* fallthrough */ - case 11: - CRCsinglet(crc0, next, -11 * 8); - /* fallthrough */ - case 10: - CRCsinglet(crc0, next, -10 * 8); - /* fallthrough */ - case 9: - CRCsinglet(crc0, next, -9 * 
8); - /* fallthrough */ - case 8: - CRCsinglet(crc0, next, -8 * 8); - /* fallthrough */ - case 7: - CRCsinglet(crc0, next, -7 * 8); - /* fallthrough */ - case 6: - CRCsinglet(crc0, next, -6 * 8); - /* fallthrough */ - case 5: - CRCsinglet(crc0, next, -5 * 8); - /* fallthrough */ - case 4: - CRCsinglet(crc0, next, -4 * 8); - /* fallthrough */ - case 3: - CRCsinglet(crc0, next, -3 * 8); - /* fallthrough */ - case 2: - CRCsinglet(crc0, next, -2 * 8); - /* fallthrough */ - case 1: - CRCsinglet(crc0, next, -1 * 8); - /* fallthrough */ - case 0:; - } - } - { - align_to_8(len, crc0, next); - return (uint32_t)crc0 ^ 0xffffffffu; - } +static inline uint32_t Extend(uint32_t crc, const char* buf, size_t size) +{ + return ChosenExtend(crc, buf, size); } -#else -#define NO_THREEWAY_CRC32C -#endif //HAVE_SSE42 && HAVE_PCLMUL - -static inline Function Choose_Extend() { -#ifdef HAVE_POWER8 - return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>; +extern "C" const char *my_crc32c_implementation() +{ +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) + if (ChosenExtend == ExtendPPCImpl) + return "Using POWER8 crc32 instructions"; #elif defined(HAVE_ARMV8_CRC) - if(crc32c_aarch64_available()) { - return ExtendARMImpl; - } else { - return ExtendImpl<Slow_CRC32>; - } -#else - if (isSSE42()) { - if (isPCLMULQDQ()) { -#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C - return crc32c_3way; -#else - return ExtendImpl<Fast_CRC32>; // Fast_CRC32 will check HAVE_SSE42 itself -#endif - } - else { // no runtime PCLMULQDQ support but has SSE42 support - return ExtendImpl<Fast_CRC32>; - } - } // end of isSSE42() - else { - return ExtendImpl<Slow_CRC32>; - } + if (const char *ret= crc32c_aarch64_available()) + return ret; +#elif HAVE_SSE42 +# if defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8 + if (ChosenExtend == crc32c_3way) + return "Using crc32 + pclmulqdq instructions"; +# endif + if (ChosenExtend == crc32c_sse42) + return "Using SSE4.2 crc32 instructions"; #endif -} 
- -static const Function ChosenExtend = Choose_Extend(); - -static inline uint32_t Extend(uint32_t crc, const char* buf, size_t size) { - return ChosenExtend(crc, buf, size); + return "Using generic crc32 instructions"; } } // namespace crc32c } // namespace mysys_namespace -extern "C" unsigned int my_crc32c(unsigned int crc, const char *buf, size_t size) +extern "C" unsigned my_crc32c(unsigned int crc, const char *buf, size_t size) { return mysys_namespace::crc32c::Extend(crc,buf, size); } diff --git a/mysys/crc32/crc32c_amd64.cc b/mysys/crc32/crc32c_amd64.cc new file mode 100644 index 00000000000..22c492b457f --- /dev/null +++ b/mysys/crc32/crc32c_amd64.cc @@ -0,0 +1,711 @@ +/* Copyright (c) 2020, 2021, MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. 
If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * Ferry Toth + * ftoth@exalondelft.nl + * + * https://github.com/htot/crc32c + * + * Modified by Facebook + * + * Original intel whitepaper: + * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * + * This version is from the folly library, created by Dave Watson <davejwatson@fb.com> + * +*/ + +#include <stdint.h> +#include <nmmintrin.h> +#include <wmmintrin.h> + + +#define CRCtriplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \ + crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); + +#define CRCduplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); + +#define CRCsinglet(crc, buf, offset) \ + crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); + + +// Numbers taken directly from intel whitepaper. 
+// clang-format off +static const uint64_t clmul_constants alignas(16) [] = { + 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6, + 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e, + 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da, + 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8, + 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296, + 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2, + 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6, + 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092, + 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0, + 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456, + 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e, + 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a, + 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574, + 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832, + 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124, + 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86, + 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e, + 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a, + 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46, + 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a, + 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a, + 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4, + 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56, + 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2, + 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c, + 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac, + 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64, + 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e, + 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c, + 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28, + 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26, + 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c, + 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c, + 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c, + 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4, + 0x01e41e9fc, 0x04c144932, 
0x086d8e4d2, 0x0271d9844, + 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c, + 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730, + 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c, + 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2, + 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2, + 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e, + 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a, + 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a, + 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a, + 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768, + 0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4, + 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c, + 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba, + 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312, + 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544, + 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a, + 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e, + 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a, + 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c, + 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a, + 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6, + 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca, + 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888, + 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e, + 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528, + 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a, + 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e, + 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa, +}; + +// Compute the crc32c value for buffer smaller than 8 +static inline void align_to_8( + size_t len, + uint64_t& crc0, // crc so far, updated on return + const unsigned char*& next) { // next data pointer, updated on return + uint32_t crc32bit = static_cast<uint32_t>(crc0); + if (len & 0x04) { + crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next); + next += sizeof(uint32_t); + } + if (len & 0x02) { + crc32bit = _mm_crc32_u16(crc32bit, 
*(uint16_t*)next); + next += sizeof(uint16_t); + } + if (len & 0x01) { + crc32bit = _mm_crc32_u8(crc32bit, *(next)); + next++; + } + crc0 = crc32bit; +} + +// +// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well +// chosen constant and xor's these with the remaining CRC. +// +static inline uint64_t CombineCRC( + size_t block_size, + uint64_t crc0, + uint64_t crc1, + uint64_t crc2, + const uint64_t* next2) { + const auto multiplier = + *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1); + const auto crc0_xmm = _mm_set_epi64x(0, crc0); + const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00); + const auto crc1_xmm = _mm_set_epi64x(0, crc1); + const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10); + const auto res = _mm_xor_si128(res0, res1); + crc0 = _mm_cvtsi128_si64(res); + crc0 = crc0 ^ *((uint64_t*)next2 - 1); + crc2 = _mm_crc32_u64(crc2, crc0); + return crc2; +} + +// Compute CRC-32C using the Intel hardware instruction. 
+extern "C" +uint32_t crc32c_3way(uint32_t crc, const char *buf, size_t len) +{ + const unsigned char* next = (const unsigned char*)buf; + uint64_t count; + uint64_t crc0, crc1, crc2; + crc0 = crc ^ 0xffffffffu; + + + if (len >= 8) { + // if len > 216 then align and use triplets + if (len > 216) { + { + // Work on the bytes (< 8) before the first 8-byte alignment addr starts + auto align_bytes = (8 - (uintptr_t)next) & 7; + len -= align_bytes; + align_to_8(align_bytes, crc0, next); + } + + // Now work on the remaining blocks + count = len / 24; // number of triplets + len %= 24; // bytes remaining + uint64_t n = count >> 7; // #blocks = first block + full blocks + uint64_t block_size = count & 127; + if (block_size == 0) { + block_size = 128; + } else { + n++; + } + // points to the first byte of the next block + const uint64_t* next0 = (uint64_t*)next + block_size; + const uint64_t* next1 = next0 + block_size; + const uint64_t* next2 = next1 + block_size; + + crc1 = crc2 = 0; + // Use Duff's device, a do-while loop inside a switch() + // statement. 
This needs to execute at least once, round len + // down to nearest triplet multiple + switch (block_size) { + case 128: + do { + // jumps here for a full block of len 128 + CRCtriplet(crc, next, -128); + /* fallthrough */ + case 127: + // jumps here or below for the first block smaller + CRCtriplet(crc, next, -127); + /* fallthrough */ + case 126: + CRCtriplet(crc, next, -126); // than 128 + /* fallthrough */ + case 125: + CRCtriplet(crc, next, -125); + /* fallthrough */ + case 124: + CRCtriplet(crc, next, -124); + /* fallthrough */ + case 123: + CRCtriplet(crc, next, -123); + /* fallthrough */ + case 122: + CRCtriplet(crc, next, -122); + /* fallthrough */ + case 121: + CRCtriplet(crc, next, -121); + /* fallthrough */ + case 120: + CRCtriplet(crc, next, -120); + /* fallthrough */ + case 119: + CRCtriplet(crc, next, -119); + /* fallthrough */ + case 118: + CRCtriplet(crc, next, -118); + /* fallthrough */ + case 117: + CRCtriplet(crc, next, -117); + /* fallthrough */ + case 116: + CRCtriplet(crc, next, -116); + /* fallthrough */ + case 115: + CRCtriplet(crc, next, -115); + /* fallthrough */ + case 114: + CRCtriplet(crc, next, -114); + /* fallthrough */ + case 113: + CRCtriplet(crc, next, -113); + /* fallthrough */ + case 112: + CRCtriplet(crc, next, -112); + /* fallthrough */ + case 111: + CRCtriplet(crc, next, -111); + /* fallthrough */ + case 110: + CRCtriplet(crc, next, -110); + /* fallthrough */ + case 109: + CRCtriplet(crc, next, -109); + /* fallthrough */ + case 108: + CRCtriplet(crc, next, -108); + /* fallthrough */ + case 107: + CRCtriplet(crc, next, -107); + /* fallthrough */ + case 106: + CRCtriplet(crc, next, -106); + /* fallthrough */ + case 105: + CRCtriplet(crc, next, -105); + /* fallthrough */ + case 104: + CRCtriplet(crc, next, -104); + /* fallthrough */ + case 103: + CRCtriplet(crc, next, -103); + /* fallthrough */ + case 102: + CRCtriplet(crc, next, -102); + /* fallthrough */ + case 101: + CRCtriplet(crc, next, -101); + /* fallthrough */ + case 
100: + CRCtriplet(crc, next, -100); + /* fallthrough */ + case 99: + CRCtriplet(crc, next, -99); + /* fallthrough */ + case 98: + CRCtriplet(crc, next, -98); + /* fallthrough */ + case 97: + CRCtriplet(crc, next, -97); + /* fallthrough */ + case 96: + CRCtriplet(crc, next, -96); + /* fallthrough */ + case 95: + CRCtriplet(crc, next, -95); + /* fallthrough */ + case 94: + CRCtriplet(crc, next, -94); + /* fallthrough */ + case 93: + CRCtriplet(crc, next, -93); + /* fallthrough */ + case 92: + CRCtriplet(crc, next, -92); + /* fallthrough */ + case 91: + CRCtriplet(crc, next, -91); + /* fallthrough */ + case 90: + CRCtriplet(crc, next, -90); + /* fallthrough */ + case 89: + CRCtriplet(crc, next, -89); + /* fallthrough */ + case 88: + CRCtriplet(crc, next, -88); + /* fallthrough */ + case 87: + CRCtriplet(crc, next, -87); + /* fallthrough */ + case 86: + CRCtriplet(crc, next, -86); + /* fallthrough */ + case 85: + CRCtriplet(crc, next, -85); + /* fallthrough */ + case 84: + CRCtriplet(crc, next, -84); + /* fallthrough */ + case 83: + CRCtriplet(crc, next, -83); + /* fallthrough */ + case 82: + CRCtriplet(crc, next, -82); + /* fallthrough */ + case 81: + CRCtriplet(crc, next, -81); + /* fallthrough */ + case 80: + CRCtriplet(crc, next, -80); + /* fallthrough */ + case 79: + CRCtriplet(crc, next, -79); + /* fallthrough */ + case 78: + CRCtriplet(crc, next, -78); + /* fallthrough */ + case 77: + CRCtriplet(crc, next, -77); + /* fallthrough */ + case 76: + CRCtriplet(crc, next, -76); + /* fallthrough */ + case 75: + CRCtriplet(crc, next, -75); + /* fallthrough */ + case 74: + CRCtriplet(crc, next, -74); + /* fallthrough */ + case 73: + CRCtriplet(crc, next, -73); + /* fallthrough */ + case 72: + CRCtriplet(crc, next, -72); + /* fallthrough */ + case 71: + CRCtriplet(crc, next, -71); + /* fallthrough */ + case 70: + CRCtriplet(crc, next, -70); + /* fallthrough */ + case 69: + CRCtriplet(crc, next, -69); + /* fallthrough */ + case 68: + CRCtriplet(crc, next, -68); + /* 
fallthrough */ + case 67: + CRCtriplet(crc, next, -67); + /* fallthrough */ + case 66: + CRCtriplet(crc, next, -66); + /* fallthrough */ + case 65: + CRCtriplet(crc, next, -65); + /* fallthrough */ + case 64: + CRCtriplet(crc, next, -64); + /* fallthrough */ + case 63: + CRCtriplet(crc, next, -63); + /* fallthrough */ + case 62: + CRCtriplet(crc, next, -62); + /* fallthrough */ + case 61: + CRCtriplet(crc, next, -61); + /* fallthrough */ + case 60: + CRCtriplet(crc, next, -60); + /* fallthrough */ + case 59: + CRCtriplet(crc, next, -59); + /* fallthrough */ + case 58: + CRCtriplet(crc, next, -58); + /* fallthrough */ + case 57: + CRCtriplet(crc, next, -57); + /* fallthrough */ + case 56: + CRCtriplet(crc, next, -56); + /* fallthrough */ + case 55: + CRCtriplet(crc, next, -55); + /* fallthrough */ + case 54: + CRCtriplet(crc, next, -54); + /* fallthrough */ + case 53: + CRCtriplet(crc, next, -53); + /* fallthrough */ + case 52: + CRCtriplet(crc, next, -52); + /* fallthrough */ + case 51: + CRCtriplet(crc, next, -51); + /* fallthrough */ + case 50: + CRCtriplet(crc, next, -50); + /* fallthrough */ + case 49: + CRCtriplet(crc, next, -49); + /* fallthrough */ + case 48: + CRCtriplet(crc, next, -48); + /* fallthrough */ + case 47: + CRCtriplet(crc, next, -47); + /* fallthrough */ + case 46: + CRCtriplet(crc, next, -46); + /* fallthrough */ + case 45: + CRCtriplet(crc, next, -45); + /* fallthrough */ + case 44: + CRCtriplet(crc, next, -44); + /* fallthrough */ + case 43: + CRCtriplet(crc, next, -43); + /* fallthrough */ + case 42: + CRCtriplet(crc, next, -42); + /* fallthrough */ + case 41: + CRCtriplet(crc, next, -41); + /* fallthrough */ + case 40: + CRCtriplet(crc, next, -40); + /* fallthrough */ + case 39: + CRCtriplet(crc, next, -39); + /* fallthrough */ + case 38: + CRCtriplet(crc, next, -38); + /* fallthrough */ + case 37: + CRCtriplet(crc, next, -37); + /* fallthrough */ + case 36: + CRCtriplet(crc, next, -36); + /* fallthrough */ + case 35: + CRCtriplet(crc, 
next, -35); + /* fallthrough */ + case 34: + CRCtriplet(crc, next, -34); + /* fallthrough */ + case 33: + CRCtriplet(crc, next, -33); + /* fallthrough */ + case 32: + CRCtriplet(crc, next, -32); + /* fallthrough */ + case 31: + CRCtriplet(crc, next, -31); + /* fallthrough */ + case 30: + CRCtriplet(crc, next, -30); + /* fallthrough */ + case 29: + CRCtriplet(crc, next, -29); + /* fallthrough */ + case 28: + CRCtriplet(crc, next, -28); + /* fallthrough */ + case 27: + CRCtriplet(crc, next, -27); + /* fallthrough */ + case 26: + CRCtriplet(crc, next, -26); + /* fallthrough */ + case 25: + CRCtriplet(crc, next, -25); + /* fallthrough */ + case 24: + CRCtriplet(crc, next, -24); + /* fallthrough */ + case 23: + CRCtriplet(crc, next, -23); + /* fallthrough */ + case 22: + CRCtriplet(crc, next, -22); + /* fallthrough */ + case 21: + CRCtriplet(crc, next, -21); + /* fallthrough */ + case 20: + CRCtriplet(crc, next, -20); + /* fallthrough */ + case 19: + CRCtriplet(crc, next, -19); + /* fallthrough */ + case 18: + CRCtriplet(crc, next, -18); + /* fallthrough */ + case 17: + CRCtriplet(crc, next, -17); + /* fallthrough */ + case 16: + CRCtriplet(crc, next, -16); + /* fallthrough */ + case 15: + CRCtriplet(crc, next, -15); + /* fallthrough */ + case 14: + CRCtriplet(crc, next, -14); + /* fallthrough */ + case 13: + CRCtriplet(crc, next, -13); + /* fallthrough */ + case 12: + CRCtriplet(crc, next, -12); + /* fallthrough */ + case 11: + CRCtriplet(crc, next, -11); + /* fallthrough */ + case 10: + CRCtriplet(crc, next, -10); + /* fallthrough */ + case 9: + CRCtriplet(crc, next, -9); + /* fallthrough */ + case 8: + CRCtriplet(crc, next, -8); + /* fallthrough */ + case 7: + CRCtriplet(crc, next, -7); + /* fallthrough */ + case 6: + CRCtriplet(crc, next, -6); + /* fallthrough */ + case 5: + CRCtriplet(crc, next, -5); + /* fallthrough */ + case 4: + CRCtriplet(crc, next, -4); + /* fallthrough */ + case 3: + CRCtriplet(crc, next, -3); + /* fallthrough */ + case 2: + CRCtriplet(crc, 
next, -2); + /* fallthrough */ + case 1: + CRCduplet(crc, next, -1); // the final triplet is actually only 2 + //{ CombineCRC(); } + crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2); + if (--n > 0) { + crc1 = crc2 = 0; + block_size = 128; + // points to the first byte of the next block + next0 = next2 + 128; + next1 = next0 + 128; // from here on all blocks are 128 long + next2 = next1 + 128; + } + /* fallthrough */ + case 0:; + } while (n > 0); + } + next = (const unsigned char*)next2; + } + uint64_t count2 = len >> 3; // 216 or fewer bytes is 27 or fewer singlets + len = len & 7; + next += (count2 * 8); + switch (count2) { + case 27: + CRCsinglet(crc0, next, -27 * 8); + /* fallthrough */ + case 26: + CRCsinglet(crc0, next, -26 * 8); + /* fallthrough */ + case 25: + CRCsinglet(crc0, next, -25 * 8); + /* fallthrough */ + case 24: + CRCsinglet(crc0, next, -24 * 8); + /* fallthrough */ + case 23: + CRCsinglet(crc0, next, -23 * 8); + /* fallthrough */ + case 22: + CRCsinglet(crc0, next, -22 * 8); + /* fallthrough */ + case 21: + CRCsinglet(crc0, next, -21 * 8); + /* fallthrough */ + case 20: + CRCsinglet(crc0, next, -20 * 8); + /* fallthrough */ + case 19: + CRCsinglet(crc0, next, -19 * 8); + /* fallthrough */ + case 18: + CRCsinglet(crc0, next, -18 * 8); + /* fallthrough */ + case 17: + CRCsinglet(crc0, next, -17 * 8); + /* fallthrough */ + case 16: + CRCsinglet(crc0, next, -16 * 8); + /* fallthrough */ + case 15: + CRCsinglet(crc0, next, -15 * 8); + /* fallthrough */ + case 14: + CRCsinglet(crc0, next, -14 * 8); + /* fallthrough */ + case 13: + CRCsinglet(crc0, next, -13 * 8); + /* fallthrough */ + case 12: + CRCsinglet(crc0, next, -12 * 8); + /* fallthrough */ + case 11: + CRCsinglet(crc0, next, -11 * 8); + /* fallthrough */ + case 10: + CRCsinglet(crc0, next, -10 * 8); + /* fallthrough */ + case 9: + CRCsinglet(crc0, next, -9 * 8); + /* fallthrough */ + case 8: + CRCsinglet(crc0, next, -8 * 8); + /* fallthrough */ + case 7: + CRCsinglet(crc0, next, -7 * 8); + 
/* fallthrough */ + case 6: + CRCsinglet(crc0, next, -6 * 8); + /* fallthrough */ + case 5: + CRCsinglet(crc0, next, -5 * 8); + /* fallthrough */ + case 4: + CRCsinglet(crc0, next, -4 * 8); + /* fallthrough */ + case 3: + CRCsinglet(crc0, next, -3 * 8); + /* fallthrough */ + case 2: + CRCsinglet(crc0, next, -2 * 8); + /* fallthrough */ + case 1: + CRCsinglet(crc0, next, -1 * 8); + /* fallthrough */ + case 0:; + } + } + { + align_to_8(len, crc0, next); + return (uint32_t)crc0 ^ 0xffffffffu; + } +} diff --git a/mysys/crc32ieee.cc b/mysys/crc32ieee.cc index 5f8344b4f9d..bbafa1230f8 100644 --- a/mysys/crc32ieee.cc +++ b/mysys/crc32ieee.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2020, 2021, MariaDB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -39,25 +39,23 @@ typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t); static my_crc32_t init_crc32() { - my_crc32_t func= my_crc32_zlib; #ifdef HAVE_PCLMUL if (crc32_pclmul_enabled()) - func = crc32_pclmul; + return crc32_pclmul; #elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC) if (crc32_aarch64_available()) - func= crc32_aarch64; + return crc32_aarch64; #endif - return func; + return my_crc32_zlib; } static const my_crc32_t my_checksum_func= init_crc32(); -#ifndef __powerpc64__ -/* For powerpc, my_checksum is defined elsewhere.*/ -extern "C" unsigned int my_checksum(unsigned int crc, const void *data, size_t len) +#ifdef __powerpc64__ +# error "my_checksum() is defined in mysys/crc32/crc32_ppc64.c" +#endif +extern "C" +unsigned int my_checksum(unsigned int crc, const void *data, size_t len) { return my_checksum_func(crc, data, len); } -#endif - - |