author     Shahriar Rouf <nafi@google.com>    2020-11-13 16:26:13 +0000
committer  Victor Costan <costan@google.com>  2020-11-18 23:21:21 +0000
commit     4d2dc9dcbb4c4c68872e9a2952426dd7dee91e56 (patch)
tree       95d0f27c72a092f8372e57c15d6b51fa88072c03
parent     11e5165b98c32038fad44ee282619484ed3b80da (diff)
download   snappy-git-4d2dc9dcbb4c4c68872e9a2952426dd7dee91e56.tar.gz
Optimize zippy unzipping by more than 10% by making IncrementalCopy faster.
When SSSE3 is available:
- Use PSHUFB (_mm_shuffle_epi8) to handle pattern sizes 1 to 15 (previously it handled sizes 1 to 7).
- This enables us to do 16-byte copies instead of 8-byte copies, because we know that the pattern size is >= 16.
- Use a shuffle-reshuffle strategy to generate the next pattern after loading the initial pattern. This lets us write 4 conditionals (similar to when pattern size >= 16), which allows FDO to lay out the code with respect to the actual probabilities of each length.
- The PSHUFB masks are now generated programmatically at compile time.

When SSSE3 is unavailable:
- No change.

In both cases:
- assert(op < op_limit) in IncrementalCopy so that we can check 'op_limit <= buf_limit - 15' instead of 'op_limit <= buf_limit - 16'. All existing call sites of IncrementalCopy guarantee this.

PiperOrigin-RevId: 342267037
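For illustration only (not part of the patch): a minimal scalar sketch of the shuffle-reshuffle strategy, using the pattern-size-6 example from the new comments in snappy.cc. MakeMask and Shuffle are hypothetical stand-ins for the compile-time PSHUFB mask tables and _mm_shuffle_epi8.

// Illustrative scalar sketch of shuffle-reshuffle; names are hypothetical.
#include <array>
#include <cassert>

// Byte i of the mask is (index_offset + i) % pattern_size.
std::array<unsigned char, 16> MakeMask(int index_offset, int pattern_size) {
  std::array<unsigned char, 16> mask{};
  for (int i = 0; i < 16; ++i) {
    mask[i] = static_cast<unsigned char>((index_offset + i) % pattern_size);
  }
  return mask;
}

// Scalar stand-in for PSHUFB: out[i] = in[mask[i]].
std::array<char, 16> Shuffle(const std::array<char, 16>& in,
                             const std::array<unsigned char, 16>& mask) {
  std::array<char, 16> out{};
  for (int i = 0; i < 16; ++i) out[i] = in[mask[i]];
  return out;
}

int main() {
  // A 6-byte pattern "abcdef" sitting at the start of a 16-byte load.
  std::array<char, 16> src = {'a', 'b', 'c', 'd', 'e', 'f'};
  const auto generation_mask = MakeMask(/*index_offset=*/0, /*pattern_size=*/6);
  const auto reshuffle_mask = MakeMask(/*index_offset=*/16, /*pattern_size=*/6);

  // Generation: first 16 bytes of the repeating pattern, "abcdefabcdefabcd".
  std::array<char, 16> pattern = Shuffle(src, generation_mask);
  assert(pattern[6] == 'a' && pattern[15] == 'd');

  // Reshuffle: the *next* 16 bytes of the pattern, "efabcdefabcdefab",
  // so each reshuffle prepares the register for the next 16-byte store.
  pattern = Shuffle(pattern, reshuffle_mask);
  assert(pattern[0] == 'e' && pattern[2] == 'a');
  return 0;
}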
-rw-r--r--  snappy.cc  194
1 file changed, 140 insertions, 54 deletions
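As an aside (not part of the patch), the relaxed bound from the "In both cases" bullet of the commit message above can be checked with a tiny sketch: given the strengthened assert op < op_limit, checking op_limit <= buf_limit - 15 keeps every 16-byte store inside the buffer. The helper name below is hypothetical.

// Minimal sketch of the bounds arithmetic. With op < op_limit and
// op_limit <= buf_limit - 15 we have op <= op_limit - 1 <= buf_limit - 16,
// so a 16-byte store at op (and at op + 16/32/48 under the corresponding
// 'op + k < op_limit' guards) ends at or before buf_limit.
#include <cassert>

inline void CheckSixteenByteStoreFits(char* op, char* op_limit,
                                      char* buf_limit) {
  assert(op < op_limit);               // guaranteed by all call sites
  assert(op_limit <= buf_limit - 15);  // the relaxed gating check
  assert(op + 16 <= buf_limit);        // hence a full 16-byte store fits
}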
diff --git a/snappy.cc b/snappy.cc
index 41776a9..b0e4414 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -137,6 +137,16 @@ void UnalignedCopy128(const void* src, void* dst) {
std::memcpy(dst, tmp, 16);
}
+template <bool use_16bytes_chunk>
+inline void ConditionalUnalignedCopy128(const char* src, char* dst) {
+ if (use_16bytes_chunk) {
+ UnalignedCopy128(src, dst);
+ } else {
+ UnalignedCopy64(src, dst);
+ UnalignedCopy64(src + 8, dst + 8);
+ }
+}
+
// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
// for handling COPY operations where the input and output regions may overlap.
// For example, suppose:
@@ -164,21 +174,54 @@ inline char* IncrementalCopySlow(const char* src, char* op,
#if SNAPPY_HAVE_SSSE3
-// This is a table of shuffle control masks that can be used as the source
+// Computes the bytes for shuffle control mask (please read comments on
+// 'pattern_generation_masks' as well) for the given index_offset and
+// pattern_size. For example, when the 'offset' is 6, it will generate a
+// repeating pattern of size 6. So, the first 16 byte indexes will correspond to
+// the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and the
+// next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1, 2, 3,
+// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
+// calling MakePatternMaskBytes(0, 6, std::make_integer_sequence<int, 16>()) and
+// MakePatternMaskBytes(16, 6, std::make_integer_sequence<int, 16>())
+// respectively.
+template <int... indexes>
+constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
+ int index_offset, int pattern_size,
+ std::integer_sequence<int, indexes...>) {
+ return {static_cast<char>((index_offset + indexes) % pattern_size)...};
+}
+
+// Computes the shuffle control mask bytes array for given pattern-sizes and
+// returns an array.
+template <int... pattern_sizes_minus_one>
+constexpr std::array<std::array<char, sizeof(__m128i)>,
+ sizeof...(pattern_sizes_minus_one)>
+MakePatternMaskBytesTable(
+ int index_offset, std::integer_sequence<int, pattern_sizes_minus_one...>) {
+ return {MakePatternMaskBytes(
+ index_offset, pattern_sizes_minus_one + 1,
+ std::make_integer_sequence<int, /*indexes=*/sizeof(__m128i)>())...};
+}
+
+// This is an array of shuffle control masks that can be used as the source
// operand for PSHUFB to permute the contents of the destination XMM register
// into a repeating byte pattern.
-alignas(16) const char pshufb_fill_patterns[7][16] = {
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
- {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
- {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
- {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
- {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
- {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
-};
-
-// j * (16 / j) for all j from 0 to 7. 0 is not actually used.
-const uint8_t pattern_size_table[8] = {0, 16, 16, 15, 16, 15, 12, 14};
+alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
+ 16> pattern_generation_masks =
+ MakePatternMaskBytesTable(
+ /*index_offset=*/0,
+ /*pattern_sizes_minus_one=*/std::make_integer_sequence<int, 16>());
+
+// Similar to 'pattern_generation_masks', this table is used to "rotate" the
+// pattern so that we can copy the *next 16 bytes* consistent with the pattern.
+// Basically, pattern_reshuffle_masks is a continuation of
+// pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as
+// pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
+alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
+ 16> pattern_reshuffle_masks =
+ MakePatternMaskBytesTable(
+ /*index_offset=*/16,
+ /*pattern_sizes_minus_one=*/std::make_integer_sequence<int, 16>());
#endif // SNAPPY_HAVE_SSSE3
@@ -187,13 +230,19 @@ const uint8_t pattern_size_table[8] = {0, 16, 16, 15, 16, 15, 12, 14};
// region of the buffer.
inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
char* const buf_limit) {
+#if SNAPPY_HAVE_SSSE3
+ constexpr int big_pattern_size_lower_bound = 16;
+#else
+ constexpr int big_pattern_size_lower_bound = 8;
+#endif
+
// Terminology:
//
// slop = buf_limit - op
// pat = op - src
// len = limit - op
assert(src < op);
- assert(op <= op_limit);
+ assert(op < op_limit);
assert(op_limit <= buf_limit);
// NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64.
assert(op_limit - op <= 64);
@@ -224,11 +273,13 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// input. In general if we always predict len <= 16 it would be an ok
// prediction.
//
- // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
- // copying 2x 8 bytes at a time.
+ // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE)
+ // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a
+ // time.
- // Handle the uncommon case where pattern is less than 8 bytes.
- if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
+ // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)
+ // bytes.
+ if (SNAPPY_PREDICT_FALSE(pattern_size < big_pattern_size_lower_bound)) {
#if SNAPPY_HAVE_SSSE3
// Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
// to permute the register's contents in-place into a repeating sequence of
@@ -242,24 +293,63 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// The non-SSE fallback implementation suffers from store-forwarding stalls
// because its loads and stores partly overlap. By expanding the pattern
// in-place, we avoid the penalty.
- if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
- const __m128i shuffle_mask = _mm_load_si128(
- reinterpret_cast<const __m128i*>(pshufb_fill_patterns) +
- pattern_size - 1);
- const __m128i pattern = _mm_shuffle_epi8(
- _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
- // Uninitialized bytes are masked out by the shuffle mask.
- // TODO: remove annotation and macro defs once MSan is fixed.
- SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
- pattern_size = pattern_size_table[pattern_size];
- char* op_end = std::min(op_limit, buf_limit - 15);
- while (op < op_end) {
- _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
- op += pattern_size;
+ const __m128i pattern_generation_mask =
+ _mm_load_si128(reinterpret_cast<const __m128i*>(
+ pattern_generation_masks[pattern_size - 1].data()));
+ // Uninitialized bytes are masked out by the shuffle mask.
+ // TODO: remove annotation and macro defs once MSan is fixed.
+ SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(op, 16 - pattern_size);
+ __m128i pattern =
+ _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(src)),
+ pattern_generation_mask);
+
+ // In addition to the 'pattern_generation_mask', this mask will generate the
+ // next 16 bytes in-place. Doing so enables us to write 4 conditionals
+ // similar to when 'pattern_size >= big_pattern_size_lower_bound'.
+ // For example, suppose pattern is abcabcabcabcabca. Shuffling with this
+ // mask will generate bcabcabcabcabcab. Shuffling again will generate
+ // cabcabcabcabcabc.
+ const __m128i pattern_reshuffle_mask =
+ _mm_load_si128(reinterpret_cast<const __m128i*>(
+ pattern_reshuffle_masks[pattern_size - 1].data()));
+
+ // Typically, the op_limit is the gating factor so try to simplify the loop
+ // based on that.
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
+ // There is at least one, and at most four 16-byte blocks. Writing four
+ // conditionals instead of a loop allows FDO to layout the code with
+ // respect to the actual probabilities of each length.
+ // TODO: Replace with loop with trip count hint.
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+
+ if (op + 16 < op_limit) {
+ pattern = _mm_shuffle_epi8(pattern, pattern_reshuffle_mask);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 16), pattern);
}
- if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+ if (op + 32 < op_limit) {
+ pattern = _mm_shuffle_epi8(pattern, pattern_reshuffle_mask);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 32), pattern);
+ }
+ if (op + 48 < op_limit) {
+ pattern = _mm_shuffle_epi8(pattern, pattern_reshuffle_mask);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 48), pattern);
+ }
+ return op_limit;
+ }
+
+ // Fall back to doing as much as we can with the available slop in the
+ // buffer. This code path is relatively cold however so we save code size by
+ // avoiding unrolling and vectorizing.
+ //
+ // TODO: Remove pragma when when cold regions don't get
+ // vectorized or unrolled.
+#pragma nounroll
+ while (op + 16 <= op_limit) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+ pattern = _mm_shuffle_epi8(pattern, pattern_reshuffle_mask);
+ op += 16;
}
- return IncrementalCopySlow(src, op, op_limit);
+ return IncrementalCopySlow(op - pattern_size, op, op_limit);
#else // !SNAPPY_HAVE_SSSE3
// If plenty of buffer space remains, expand the pattern to at least 8
// bytes. The way the following loop is written, we need 8 bytes of buffer
@@ -279,34 +369,30 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
}
#endif // SNAPPY_HAVE_SSSE3
}
- assert(pattern_size >= 8);
+ assert(pattern_size >= big_pattern_size_lower_bound);
+ constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;
- // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
- // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
- // because expanding the pattern to at least 8 bytes guarantees that
- // op - src >= 8.
+ // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src can
+ // be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in op.
+ // UnalignedCopy64 is safe because expanding the pattern to at least 8 bytes
+ // guarantees that op - src >= 8.
//
// Typically, the op_limit is the gating factor so try to simplify the loop
// based on that.
- if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
// There is at least one, and at most four 16-byte blocks. Writing four
// conditionals instead of a loop allows FDO to layout the code with respect
// to the actual probabilities of each length.
// TODO: Replace with loop with trip count hint.
- UnalignedCopy64(src, op);
- UnalignedCopy64(src + 8, op + 8);
-
+ ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
if (op + 16 < op_limit) {
- UnalignedCopy64(src + 16, op + 16);
- UnalignedCopy64(src + 24, op + 24);
+ ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16);
}
if (op + 32 < op_limit) {
- UnalignedCopy64(src + 32, op + 32);
- UnalignedCopy64(src + 40, op + 40);
+ ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32);
}
if (op + 48 < op_limit) {
- UnalignedCopy64(src + 48, op + 48);
- UnalignedCopy64(src + 56, op + 56);
+ ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48);
}
return op_limit;
}
@@ -317,12 +403,9 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
//
// TODO: Remove pragma when when cold regions don't get vectorized
// or unrolled.
-#ifdef __clang__
-#pragma clang loop unroll(disable)
-#endif
+#pragma nounroll
for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
- UnalignedCopy64(src, op);
- UnalignedCopy64(src + 8, op + 8);
+ ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
}
if (op >= op_limit) return op_limit;
@@ -916,6 +999,7 @@ class SnappyDecompressor {
preload = LittleEndian::Load32(ip);
const uint32_t trailer = ExtractLowBytes(preload, c & 3);
const uint32_t length = entry & 0xff;
+ assert(length > 0);
// copy_offset/256 is encoded in bits 8..10. By just fetching
// those bits, we get copy_offset (since the bit-field starts at
@@ -1261,6 +1345,7 @@ class SnappyIOVecWriter {
if (to_copy > len) {
to_copy = len;
}
+ assert(to_copy > 0);
IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
curr_iov_output_, curr_iov_output_ + to_copy,
@@ -1350,6 +1435,7 @@ class SnappyArrayWriter {
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
+ assert(len > 0);
char* const op = *op_p;
assert(op >= base_);
char* const op_end = op + len;