Internal change

PiperOrigin-RevId: 345360683
author: Snappy Team <no-reply@google.com> 2020-12-03 03:15:10 +0000
committer: Victor Costan <costan@google.com> 2020-12-03 22:52:52 +0000
commit: 56c2c247d037ce4b274c93e4bb638662cb79378a (patch)
tree: cb7ba247d2a8f979a57d0894dc3b080d0d866668
parent: a94be58e65dac5d676c219199b015cfc32d7ae6e (diff)
download: snappy-git-56c2c247d037ce4b274c93e4bb638662cb79378a.tar.gz
1 files changed, 56 insertions, 224 deletions
diff --git a/snappy.cc b/snappy.cc
index 6ceccce..e5e69ab 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -74,7 +74,6 @@
 #include <cstdio>
 #include <cstring>
 #include <string>
-#include <utility>
 #include <vector>
 
 namespace snappy {
@@ -179,16 +178,6 @@ void UnalignedCopy128(const void* src, void* dst) {
   std::memcpy(dst, tmp, 16);
 }
 
-template <bool use_16bytes_chunk>
-inline void ConditionalUnalignedCopy128(const char* src, char* dst) {
-  if (use_16bytes_chunk) {
-    UnalignedCopy128(src, dst);
-  } else {
-    UnalignedCopy64(src, dst);
-    UnalignedCopy64(src + 8, dst + 8);
-  }
-}
-
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
 // for handling COPY operations where the input and output regions may overlap.
 // For example, suppose:
@@ -216,164 +205,36 @@ inline char* IncrementalCopySlow(const char* src, char* op,
 
 #if SNAPPY_HAVE_SSSE3
 
-// Computes the bytes for shuffle control mask (please read comments on
-// 'pattern_generation_masks' as well) for the given index_offset and
-// pattern_size. For example, when the 'offset' is 6, it will generate a
-// repeating pattern of size 6. So, the first 16 byte indexes will correspond to
-// the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and the
-// next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1, 2, 3,
-// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
-// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
-// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
-template <size_t... indexes>
-inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
-    int index_offset, int pattern_size, index_sequence<indexes...>) {
-  return {static_cast<char>((index_offset + indexes) % pattern_size)...};
-}
-
-// Computes the shuffle control mask bytes array for given pattern-sizes and
-// returns an array.
-template <size_t... pattern_sizes_minus_one>
-inline constexpr std::array<std::array<char, sizeof(__m128i)>,
-                            sizeof...(pattern_sizes_minus_one)>
-MakePatternMaskBytesTable(int index_offset,
-                          index_sequence<pattern_sizes_minus_one...>) {
-  return {MakePatternMaskBytes(
-      index_offset, pattern_sizes_minus_one + 1,
-      make_index_sequence</*indexes=*/sizeof(__m128i)>())...};
-}
-
-// This is an array of shuffle control masks that can be used as the source
+// This is a table of shuffle control masks that can be used as the source
 // operand for PSHUFB to permute the contents of the destination XMM register
 // into a repeating byte pattern.
-alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
-                                        16> pattern_generation_masks =
-    MakePatternMaskBytesTable(
-        /*index_offset=*/0,
-        /*pattern_sizes_minus_one=*/make_index_sequence<16>());
-
-// Similar to 'pattern_generation_masks', this table is used to "rotate" the
-// pattern so that we can copy the *next 16 bytes* consistent with the pattern.
-// Basically, pattern_reshuffle_masks is a continuation of
-// pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as
-// pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
-alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
-                                        16> pattern_reshuffle_masks =
-    MakePatternMaskBytesTable(
-        /*index_offset=*/16,
-        /*pattern_sizes_minus_one=*/make_index_sequence<16>());
-
-SNAPPY_ATTRIBUTE_ALWAYS_INLINE
-static inline __m128i LoadPattern(const char* src, const size_t pattern_size) {
-  __m128i generation_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
-      pattern_generation_masks[pattern_size - 1].data()));
-  // Uninitialized bytes are masked out by the shuffle mask.
-  // TODO: remove annotation and macro defs once MSan is fixed.
-  SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size);
-  return _mm_shuffle_epi8(
-      _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)), generation_mask);
-}
-
-SNAPPY_ATTRIBUTE_ALWAYS_INLINE
-static inline std::pair<__m128i /* pattern */, __m128i /* reshuffle_mask */>
-LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
-  __m128i pattern = LoadPattern(src, pattern_size);
-
-  // This mask will generate the next 16 bytes in-place. Doing so enables us to
-  // write data by at most 4 _mm_storeu_si128.
-  //
-  // For example, suppose pattern is:        abcdefabcdefabcd
-  // Shuffling with this mask will generate: efabcdefabcdefab
-  // Shuffling again will generate:          cdefabcdefabcdef
-  __m128i reshuffle_mask = _mm_load_si128(reinterpret_cast<const __m128i*>(
-      pattern_reshuffle_masks[pattern_size - 1].data()));
-  return {pattern, reshuffle_mask};
-}
+alignas(16) const char pshufb_fill_patterns[7][16] = {
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+    {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+    {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+    {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+    {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+    {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+};
 
-#endif  // SNAPPY_HAVE_SSSE3
+// j * (16 / j) for all j from 0 to 7. 0 is not actually used.
+const uint8_t pattern_size_table[8] = {0, 16, 16, 15, 16, 15, 12, 14};
 
-// Fallback for when we need to copy while extending the pattern, for example
-// copying 10 bytes from 3 positions back abc -> abcabcabcabca.
-//
-// REQUIRES: [dst - offset, dst + 64) is a valid address range.
-SNAPPY_ATTRIBUTE_ALWAYS_INLINE
-static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
-#if SNAPPY_HAVE_SSSE3
-  if (SNAPPY_PREDICT_TRUE(offset <= 16)) {
-    switch (offset) {
-      case 0:
-        return false;
-      case 1: {
-        std::memset(dst, dst[-1], 64);
-        return true;
-      }
-      case 2:
-      case 4:
-      case 8:
-      case 16: {
-        __m128i pattern = LoadPattern(dst - offset, offset);
-        for (int i = 0; i < 4; i++) {
-          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
-        }
-        return true;
-      }
-      default: {
-        auto pattern_and_reshuffle_mask =
-            LoadPatternAndReshuffleMask(dst - offset, offset);
-        __m128i pattern = pattern_and_reshuffle_mask.first;
-        __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
-        for (int i = 0; i < 4; i++) {
-          _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
-          pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
-        }
-        return true;
-      }
-    }
-  }
-#else
-  if (SNAPPY_PREDICT_TRUE(offset < 16)) {
-    if (SNAPPY_PREDICT_FALSE(offset == 0)) return false;
-    // Extend the pattern to the first 16 bytes.
-    for (int i = 0; i < 16; i++) dst[i] = dst[i - offset];
-    // Find a multiple of pattern >= 16.
-    static std::array<uint8_t, 16> pattern_sizes = []() {
-      std::array<uint8_t, 16> res;
-      for (int i = 1; i < 16; i++) res[i] = (16 / i + 1) * i;
-      return res;
-    }();
-    offset = pattern_sizes[offset];
-    for (int i = 1; i < 4; i++) {
-      std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
-    }
-    return true;
-  }
 #endif  // SNAPPY_HAVE_SSSE3
 
-  // Very rare.
-  for (int i = 0; i < 4; i++) {
-    std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
-  }
-  return true;
-}
-
 // Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
 // IncrementalCopySlow. buf_limit is the address past the end of the writable
 // region of the buffer.
 inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
                              char* const buf_limit) {
-#if SNAPPY_HAVE_SSSE3
-  constexpr int big_pattern_size_lower_bound = 16;
-#else
-  constexpr int big_pattern_size_lower_bound = 8;
-#endif
-
   // Terminology:
   //
   // slop = buf_limit - op
   // pat  = op - src
   // len  = limit - op
   assert(src < op);
-  assert(op < op_limit);
+  assert(op <= op_limit);
   assert(op_limit <= buf_limit);
   // NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64.
   assert(op_limit - op <= 64);
@@ -404,13 +265,11 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // input. In general if we always predict len <= 16 it would be an ok
   // prediction.
   //
-  // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE)
-  // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a
-  // time.
+  // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
+  // copying 2x 8 bytes at a time.
 
-  // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)
-  // bytes.
-  if (pattern_size < big_pattern_size_lower_bound) {
+  // Handle the uncommon case where pattern is less than 8 bytes.
+  if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
 #if SNAPPY_HAVE_SSSE3
     // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
     // to permute the register's contents in-place into a repeating sequence of
@@ -424,53 +283,22 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     // The non-SSE fallback implementation suffers from store-forwarding stalls
     // because its loads and stores partly overlap. By expanding the pattern
     // in-place, we avoid the penalty.
-
-    // Typically, the op_limit is the gating factor so try to simplify the loop
-    // based on that.
-    if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
-      auto pattern_and_reshuffle_mask =
-          LoadPatternAndReshuffleMask(src, pattern_size);
-      __m128i pattern = pattern_and_reshuffle_mask.first;
-      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
-
-      // There is at least one, and at most four 16-byte blocks. Writing four
-      // conditionals instead of a loop allows FDO to layout the code with
-      // respect to the actual probabilities of each length.
-      // TODO: Replace with loop with trip count hint.
-      _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
-
-      if (op + 16 < op_limit) {
-        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 16), pattern);
-      }
-      if (op + 32 < op_limit) {
-        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 32), pattern);
-      }
-      if (op + 48 < op_limit) {
-        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 48), pattern);
-      }
-      return op_limit;
-    }
-    char* const op_end = buf_limit - 15;
-    if (SNAPPY_PREDICT_TRUE(op < op_end)) {
-      auto pattern_and_reshuffle_mask =
-          LoadPatternAndReshuffleMask(src, pattern_size);
-      __m128i pattern = pattern_and_reshuffle_mask.first;
-      __m128i reshuffle_mask = pattern_and_reshuffle_mask.second;
-
-      // This code path is relatively cold however so we save code size
-      // by avoiding unrolling and vectorizing.
-      //
-      // TODO: Remove pragma when when cold regions don't get
-      // vectorized or unrolled.
-#pragma nounroll
-      do {
+    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+      const __m128i shuffle_mask = _mm_load_si128(
+          reinterpret_cast<const __m128i*>(pshufb_fill_patterns) +
+          pattern_size - 1);
+      const __m128i pattern = _mm_shuffle_epi8(
+          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+      // Uninitialized bytes are masked out by the shuffle mask.
+      // TODO: remove annotation and macro defs once MSan is fixed.
+      SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+      pattern_size = pattern_size_table[pattern_size];
+      char* op_end = std::min(op_limit, buf_limit - 15);
+      while (op < op_end) {
         _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
-        pattern = _mm_shuffle_epi8(pattern, reshuffle_mask);
-        op += 16;
-      } while (SNAPPY_PREDICT_TRUE(op < op_end));
+        op += pattern_size;
+      }
+      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
     }
     return IncrementalCopySlow(src, op, op_limit);
 #else   // !SNAPPY_HAVE_SSSE3
@@ -492,30 +320,34 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     }
 #endif  // SNAPPY_HAVE_SSSE3
   }
-  assert(pattern_size >= big_pattern_size_lower_bound);
-  constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;
+  assert(pattern_size >= 8);
 
-  // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src can
-  // be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in op.
-  // UnalignedCopy64 is safe because expanding the pattern to at least 8 bytes
-  // guarantees that op - src >= 8.
+  // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
+  // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
+  // because expanding the pattern to at least 8 bytes guarantees that
+  // op - src >= 8.
   //
   // Typically, the op_limit is the gating factor so try to simplify the loop
   // based on that.
-  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
+  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
     // There is at least one, and at most four 16-byte blocks. Writing four
     // conditionals instead of a loop allows FDO to layout the code with respect
     // to the actual probabilities of each length.
     // TODO: Replace with loop with trip count hint.
-    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
+    UnalignedCopy64(src, op);
+    UnalignedCopy64(src + 8, op + 8);
+
     if (op + 16 < op_limit) {
-      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16);
+      UnalignedCopy64(src + 16, op + 16);
+      UnalignedCopy64(src + 24, op + 24);
     }
     if (op + 32 < op_limit) {
-      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32);
+      UnalignedCopy64(src + 32, op + 32);
+      UnalignedCopy64(src + 40, op + 40);
     }
     if (op + 48 < op_limit) {
-      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48);
+      UnalignedCopy64(src + 48, op + 48);
+      UnalignedCopy64(src + 56, op + 56);
     }
     return op_limit;
   }
@@ -526,9 +358,12 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   //
   // TODO: Remove pragma when when cold regions don't get vectorized
   // or unrolled.
-#pragma nounroll
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
   for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
-    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
+    UnalignedCopy64(src, op);
+    UnalignedCopy64(src + 8, op + 8);
   }
   if (op >= op_limit) return op_limit;
 
@@ -1059,10 +894,10 @@ std::pair<const uint8_t*, char*> DecompressBranchless(
       if (SNAPPY_PREDICT_FALSE(std::size_t(offset) < len)) {
         assert(tag_type != 0);
         // offset 0 is an error.
-        if (!Copy64BytesWithPatternExtension(op_base + op, offset)) {
-          break;
-        }
-        op += len;
+        if (SNAPPY_PREDICT_FALSE(offset == 0)) break;
+        op = IncrementalCopy(op_base + delta, op_base + op, op_base + op + len,
+                             op_base + op_limit) -
+             op_base;
         continue;
       }
 
@@ -1254,7 +1089,6 @@ class SnappyDecompressor {
           preload = LittleEndian::Load32(ip);
           const uint32_t trailer = ExtractLowBytes(preload, c & 3);
           const uint32_t length = entry & 0xff;
-          assert(length > 0);
 
           // copy_offset/256 is encoded in bits 8..10.  By just fetching
           // those bits, we get copy_offset (since the bit-field starts at
@@ -1625,7 +1459,6 @@ class SnappyIOVecWriter {
         if (to_copy > len) {
           to_copy = len;
         }
-        assert(to_copy > 0);
 
         IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
                         curr_iov_output_, curr_iov_output_ + to_copy,
@@ -1719,7 +1552,6 @@ class SnappyArrayWriter {
 
   SNAPPY_ATTRIBUTE_ALWAYS_INLINE
   inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
-    assert(len > 0);
     char* const op = *op_p;
     assert(op >= base_);
     char* const op_end = op + len;
author	Snappy Team <no-reply@google.com>	2020-12-03 03:15:10 +0000
committer	Victor Costan <costan@google.com>	2020-12-03 22:52:52 +0000
commit	56c2c247d037ce4b274c93e4bb638662cb79378a (patch)
tree	cb7ba247d2a8f979a57d0894dc3b080d0d866668
parent	a94be58e65dac5d676c219199b015cfc32d7ae6e (diff)
download	snappy-git-56c2c247d037ce4b274c93e4bb638662cb79378a.tar.gz