Diffstat (limited to 'snappy.cc')
-rw-r--r--  snappy.cc  34
1 file changed, 34 insertions, 0 deletions
diff --git a/snappy.cc b/snappy.cc
index 3b0de12..4008e76 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -30,6 +30,27 @@
#include "snappy-sinksource.h"
#include "snappy.h"
+#if !defined(SNAPPY_HAVE_BMI2)
+// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+// specifically, but it does define __AVX2__ when AVX2 support is available.
+// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+//
+// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+// case issuing BMI2 instructions results in a compiler error.
+#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+#define SNAPPY_HAVE_BMI2 1
+#else
+#define SNAPPY_HAVE_BMI2 0
+#endif
+#endif // !defined(SNAPPY_HAVE_BMI2)
+
+#if SNAPPY_HAVE_BMI2
+// Please do not replace with <x86intrin.h> or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <immintrin.h>
+#endif
+
#include <algorithm>
#include <array>
#include <cstddef>
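
As a sketch of how the new macro is meant to be consumed (illustrative only, not part of this change; it assumes the SNAPPY_HAVE_BMI2 definition added above is in scope, and _bzhi_u64 is a real BMI2 intrinsic from <immintrin.h>), a helper can gate BMI2 intrinsics on SNAPPY_HAVE_BMI2 and fall back to portable code otherwise:

#include <cstdint>

// Zero all bits of `value` at positions >= keep_bits.
inline uint64_t ClearHighBits(uint64_t value, unsigned keep_bits) {
#if SNAPPY_HAVE_BMI2
  return _bzhi_u64(value, keep_bits);  // single BZHI instruction
#else
  return keep_bits >= 64 ? value
                         : value & ((uint64_t{1} << keep_bits) - 1);
#endif
}
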
@@ -1060,6 +1081,10 @@ inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type,
sizeof(result));
return val & result;
+#elif defined(__aarch64__)
+ constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
+ return val & static_cast<uint32_t>(
+ (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
#else
static constexpr uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0};
return val & kExtractMasks[tag_type];
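
For reference, the new aarch64 branch packs the four 16-bit masks {0, 0xFF, 0xFFFF, 0} for tag types 0..3 into a single 64-bit constant and recovers the right mask with a shift by 16 * tag_type. A small standalone check of that arithmetic (illustrative only, not part of this change):

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
  constexpr uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0};
  for (int tag_type = 0; tag_type < 4; ++tag_type) {
    uint32_t mask = static_cast<uint32_t>(
        (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
    assert(mask == kExtractMasks[tag_type]);  // matches the table lookup
  }
  return 0;
}
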
@@ -1087,6 +1112,15 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
// ip points just past the tag and we are touching at maximum kSlopBytes
// in an iteration.
size_t tag = ip[-1];
+#if defined(__clang__) && defined(__aarch64__)
+ // Workaround for https://bugs.llvm.org/show_bug.cgi?id=51317
+// when loading 1 byte, clang for aarch64 doesn't realize that it (ldrb)
+// comes with free zero-extension, so clang generates another
+// 'and xn, xm, 0xff' before it uses that as the offset. This 'and' is
+// redundant and can be removed by adding this dummy asm, which gives
+// clang a hint that we're doing the zero-extension at the load.
+ asm("" ::"r"(tag));
+#endif
do {
// The throughput is limited by instructions, unrolling the inner loop
// twice reduces the amount of instructions checking limits and also
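
The empty asm statement takes `tag` as an "r" input and emits no instructions; because the optimizer must treat the asm as opaque, it keeps the value that ldrb already zero-extended instead of re-masking it. A standalone illustration of the same pattern (function and names are illustrative, not from this change):

#include <cstdint>

uint64_t LoadByteAsOffset(const uint8_t* p) {
  uint64_t byte = p[0];  // on aarch64 this is an ldrb, which zero-extends
#if defined(__clang__) && defined(__aarch64__)
  // Opaque to the optimizer: hints that the register already holds the
  // zero-extended byte, so no extra 'and xn, xm, 0xff' is needed.
  asm("" ::"r"(byte));
#endif
  return byte;
}
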