diff options
Diffstat (limited to 'snappy.cc')
-rw-r--r-- | snappy.cc | 34 |
1 files changed, 34 insertions, 0 deletions
@@ -30,6 +30,27 @@ #include "snappy-sinksource.h" #include "snappy.h" +#if !defined(SNAPPY_HAVE_BMI2) +// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2 +// specifically, but it does define __AVX2__ when AVX2 support is available. +// Fortunately, AVX2 was introduced in Haswell, just like BMI2. +// +// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So, +// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which +// case issuing BMI2 instructions results in a compiler error. +#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) +#define SNAPPY_HAVE_BMI2 1 +#else +#define SNAPPY_HAVE_BMI2 0 +#endif +#endif // !defined(SNAPPY_HAVE_BMI2) + +#if SNAPPY_HAVE_BMI2 +// Please do not replace with <x86intrin.h> or with headers that assume more +// advanced SSE versions without checking with all the OWNERS. +#include <immintrin.h> +#endif + #include <algorithm> #include <array> #include <cstddef> @@ -1060,6 +1081,10 @@ inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) { reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type, sizeof(result)); return val & result; +#elif defined(__aarch64__) + constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull; + return val & static_cast<uint32_t>( + (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF); #else static constexpr uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0}; return val & kExtractMasks[tag_type]; @@ -1087,6 +1112,15 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless( // ip points just past the tag and we are touching at maximum kSlopBytes // in an iteration. size_t tag = ip[-1]; +#if defined(__clang__) && defined(__aarch64__) + // Workaround for https://bugs.llvm.org/show_bug.cgi?id=51317 + // when loading 1 byte, clang for aarch64 doesn't realize that it (ldrb) + // comes with free zero-extension, so clang generates another + // 'and xn, xm, 0xff' before it uses that as the offset.
This 'and' is + // redundant and can be removed by adding this dummy asm, which gives + // clang a hint that we're doing the zero-extension at the load. + asm("" ::"r"(tag)); +#endif do { // The throughput is limited by instructions, unrolling the inner loop // twice reduces the amount of instructions checking limits and also |