author    jefflim <jefflim@google.com>          2018-08-07 18:39:54 -0700
committer Victor Costan <pwnall@chromium.org>   2018-08-07 23:41:17 -0700
commit    27ff0af12a82c9970090bf960ff6d7863774ef6c (patch)
tree      6a16460173711b72bb82245fa47ea3c3c0afbb94
parent    4ffb0e62c56b2643c4fa4c24b0585182fa803815 (diff)
Improve performance of zippy decompression to IOVecs by up to almost 50%
1) Simplify loop condition for small pattern IncrementalCopy.
2) Use pointers rather than indices to track current iovec.
3) Use fast IncrementalCopy.
4) Bypass Append check from within AppendFromSelf.

While this code greatly improves the performance of ZippyIOVecWriter, a
bigger question is whether IOVec writing should be improved, or removed.

Perf tests:

name                                  old speed      new speed      delta
BM_UFlat/0 [html ] 2.13GB/s ± 0% 2.14GB/s ± 1% ~
BM_UFlat/1 [urls ] 1.22GB/s ± 0% 1.24GB/s ± 0% +1.87%
BM_UFlat/2 [jpg ] 17.2GB/s ± 1% 17.1GB/s ± 0% ~
BM_UFlat/3 [jpg_200 ] 1.55GB/s ± 0% 1.53GB/s ± 2% ~
BM_UFlat/4 [pdf ] 12.8GB/s ± 1% 12.7GB/s ± 2% -0.36%
BM_UFlat/5 [html4 ] 1.89GB/s ± 0% 1.90GB/s ± 1% ~
BM_UFlat/6 [txt1 ] 811MB/s ± 0% 829MB/s ± 1% +2.24%
BM_UFlat/7 [txt2 ] 756MB/s ± 0% 774MB/s ± 1% +2.41%
BM_UFlat/8 [txt3 ] 860MB/s ± 0% 879MB/s ± 1% +2.16%
BM_UFlat/9 [txt4 ] 699MB/s ± 0% 715MB/s ± 1% +2.31%
BM_UFlat/10 [pb ] 2.64GB/s ± 0% 2.65GB/s ± 1% ~
BM_UFlat/11 [gaviota ] 1.00GB/s ± 0% 0.99GB/s ± 2% ~
BM_UFlat/12 [cp ] 1.66GB/s ± 1% 1.66GB/s ± 2% ~
BM_UFlat/13 [c ] 1.53GB/s ± 0% 1.47GB/s ± 5% -3.97%
BM_UFlat/14 [lsp ] 1.60GB/s ± 1% 1.55GB/s ± 5% -3.41%
BM_UFlat/15 [xls ] 1.12GB/s ± 0% 1.15GB/s ± 0% +1.93%
BM_UFlat/16 [xls_200 ] 918MB/s ± 2% 929MB/s ± 1% +1.15%
BM_UFlat/17 [bin ] 1.86GB/s ± 0% 1.89GB/s ± 1% +1.61%
BM_UFlat/18 [bin_200 ] 1.90GB/s ± 1% 1.97GB/s ± 1% +3.67%
BM_UFlat/19 [sum ] 1.32GB/s ± 0% 1.33GB/s ± 1% ~
BM_UFlat/20 [man ] 1.39GB/s ± 0% 1.36GB/s ± 3% ~
BM_UValidate/0 [html ] 2.85GB/s ± 3% 2.90GB/s ± 0% ~
BM_UValidate/1 [urls ] 1.57GB/s ± 0% 1.56GB/s ± 0% -0.20%
BM_UValidate/2 [jpg ] 824GB/s ± 0% 825GB/s ± 0% +0.11%
BM_UValidate/3 [jpg_200 ] 2.01GB/s ± 0% 2.02GB/s ± 0% +0.10%
BM_UValidate/4 [pdf ] 30.4GB/s ±11% 33.5GB/s ± 0% ~
BM_UIOVec/0 [html ] 604MB/s ± 0% 856MB/s ± 0% +41.70%
BM_UIOVec/1 [urls ] 440MB/s ± 0% 660MB/s ± 0% +49.91%
BM_UIOVec/2 [jpg ] 15.1GB/s ± 1% 15.3GB/s ± 1% +1.22%
BM_UIOVec/3 [jpg_200 ] 567MB/s ± 1% 629MB/s ± 0% +10.89%
BM_UIOVec/4 [pdf ] 7.16GB/s ± 2% 8.56GB/s ± 1% +19.64%
BM_UFlatSink/0 [html ] 2.13GB/s ± 0% 2.16GB/s ± 0% +1.47%
BM_UFlatSink/1 [urls ] 1.22GB/s ± 0% 1.25GB/s ± 0% +2.18%
BM_UFlatSink/2 [jpg ] 17.1GB/s ± 2% 17.1GB/s ± 2% ~
BM_UFlatSink/3 [jpg_200 ] 1.51GB/s ± 1% 1.53GB/s ± 2% +1.11%
BM_UFlatSink/4 [pdf ] 12.7GB/s ± 2% 12.8GB/s ± 1% +0.67%
BM_UFlatSink/5 [html4 ] 1.90GB/s ± 0% 1.92GB/s ± 0% +1.31%
BM_UFlatSink/6 [txt1 ] 810MB/s ± 0% 835MB/s ± 0% +3.04%
BM_UFlatSink/7 [txt2 ] 755MB/s ± 0% 779MB/s ± 0% +3.19%
BM_UFlatSink/8 [txt3 ] 859MB/s ± 0% 884MB/s ± 0% +2.86%
BM_UFlatSink/9 [txt4 ] 698MB/s ± 0% 718MB/s ± 0% +2.96%
BM_UFlatSink/10 [pb ] 2.64GB/s ± 0% 2.67GB/s ± 0% +1.16%
BM_UFlatSink/11 [gaviota ] 1.00GB/s ± 0% 1.01GB/s ± 0% +1.04%
BM_UFlatSink/12 [cp ] 1.66GB/s ± 1% 1.68GB/s ± 1% +0.83%
BM_UFlatSink/13 [c ] 1.52GB/s ± 1% 1.53GB/s ± 0% +0.38%
BM_UFlatSink/14 [lsp ] 1.60GB/s ± 1% 1.61GB/s ± 0% +0.91%
BM_UFlatSink/15 [xls ] 1.12GB/s ± 0% 1.15GB/s ± 0% +1.96%
BM_UFlatSink/16 [xls_200 ] 906MB/s ± 3% 920MB/s ± 1% +1.55%
BM_UFlatSink/17 [bin ] 1.86GB/s ± 0% 1.90GB/s ± 0% +2.15%
BM_UFlatSink/18 [bin_200 ] 1.85GB/s ± 2% 1.92GB/s ± 2% +4.01%
BM_UFlatSink/19 [sum ] 1.32GB/s ± 1% 1.35GB/s ± 0% +2.23%
BM_UFlatSink/20 [man ] 1.39GB/s ± 1% 1.40GB/s ± 0% +1.12%
BM_ZFlat/0 [html (22.31 %) ] 800MB/s ± 0% 793MB/s ± 0% -0.95%
BM_ZFlat/1 [urls (47.78 %) ] 423MB/s ± 0% 424MB/s ± 0% +0.11%
BM_ZFlat/2 [jpg (99.95 %) ] 12.0GB/s ± 2% 12.0GB/s ± 4% ~
BM_ZFlat/3 [jpg_200 (73.00 %)] 592MB/s ± 3% 594MB/s ± 2% ~
BM_ZFlat/4 [pdf (83.30 %) ] 7.26GB/s ± 1% 7.23GB/s ± 2% -0.49%
BM_ZFlat/5 [html4 (22.52 %) ] 738MB/s ± 0% 739MB/s ± 0% +0.17%
BM_ZFlat/6 [txt1 (57.88 %) ] 286MB/s ± 0% 285MB/s ± 0% -0.09%
BM_ZFlat/7 [txt2 (61.91 %) ] 264MB/s ± 0% 264MB/s ± 0% +0.08%
BM_ZFlat/8 [txt3 (54.99 %) ] 300MB/s ± 0% 300MB/s ± 0% ~
BM_ZFlat/9 [txt4 (66.26 %) ] 248MB/s ± 0% 247MB/s ± 0% -0.20%
BM_ZFlat/10 [pb (19.68 %) ] 1.04GB/s ± 0% 1.03GB/s ± 0% -1.17%
BM_ZFlat/11 [gaviota (37.72 %)] 451MB/s ± 0% 450MB/s ± 0% -0.35%
BM_ZFlat/12 [cp (48.12 %) ] 543MB/s ± 0% 538MB/s ± 0% -1.04%
BM_ZFlat/13 [c (42.47 %) ] 638MB/s ± 1% 643MB/s ± 0% +0.68%
BM_ZFlat/14 [lsp (48.37 %) ] 686MB/s ± 0% 691MB/s ± 1% +0.76%
BM_ZFlat/15 [xls (41.23 %) ] 636MB/s ± 0% 633MB/s ± 0% -0.52%
BM_ZFlat/16 [xls_200 (78.00 %)] 523MB/s ± 2% 520MB/s ± 2% -0.56%
BM_ZFlat/17 [bin (18.11 %) ] 1.01GB/s ± 0% 1.01GB/s ± 0% +0.50%
BM_ZFlat/18 [bin_200 (7.50 %) ] 2.45GB/s ± 1% 2.44GB/s ± 1% -0.54%
BM_ZFlat/19 [sum (48.96 %) ] 487MB/s ± 0% 478MB/s ± 0% -1.89%
BM_ZFlat/20 [man (59.21 %) ] 567MB/s ± 1% 566MB/s ± 1% ~

The BM_UFlat/13 and BM_UFlat/14 results showed high variance, so I reran
them:

name                old speed      new speed      delta
BM_UFlat/13 [c  ] 1.53GB/s ± 0% 1.53GB/s ± 1% ~
BM_UFlat/14 [lsp] 1.61GB/s ± 1% 1.61GB/s ± 1% +0.25%
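Editor's note before the diff: the heart of change (2) is replacing the
(index, bytes-written) pair that located the write position with a
(pointer, bytes-remaining) pair, so the hot loop never recomputes
iov_base + offset. A minimal standalone sketch of that cursor scheme,
with hypothetical names, not the patch's exact code:

  #include <sys/uio.h>
  #include <algorithm>
  #include <cstring>

  // Sketch: write `len` bytes from `src` across an iovec array using a
  // pointer cursor. `cur`, `out` and `remaining` mirror the patch's
  // curr_iov_, curr_iov_output_ and curr_iov_remaining_ members.
  bool WriteToIOVecs(const char* src, size_t len, const struct iovec* cur,
                     const struct iovec* end, char* out, size_t remaining) {
    while (len > 0) {
      if (remaining == 0) {            // current buffer full: advance cursor
        if (++cur == end) return false;
        out = static_cast<char*>(cur->iov_base);
        remaining = cur->iov_len;
      }
      const size_t n = std::min(len, remaining);
      std::memcpy(out, src, n);        // no iov_base + offset recomputation
      out += n; remaining -= n; src += n; len -= n;
    }
    return true;
  }

The patch applies this same scheme inside SnappyIOVecWriter's
AppendNoCheck and AppendFromSelf below.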
-rw-r--r--  snappy.cc           137
-rw-r--r--  snappy_unittest.cc    2
2 files changed, 66 insertions(+), 73 deletions(-)
diff --git a/snappy.cc b/snappy.cc
index 34fed24..a5703a3 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -30,16 +30,7 @@
#include "snappy-internal.h"
#include "snappy-sinksource.h"
-#ifndef SNAPPY_HAVE_SSE2
-#if defined(__SSE2__) || defined(_M_X64) || \
- (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
-#define SNAPPY_HAVE_SSE2 1
-#else
-#define SNAPPY_HAVE_SSE2 0
-#endif
-#endif
-
-#if SNAPPY_HAVE_SSE2
+#if defined(__SSSE3__)
#include <x86intrin.h>
#endif
#include <stdio.h>
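The hunk above replaces the old SNAPPY_HAVE_SSE2 detection block with a
single check for __SSSE3__, which GCC and Clang define when SSSE3 code
generation is enabled (for example via -mssse3, or a -march value that
implies it). A tiny standalone probe, not part of the patch, for checking
which path a given set of build flags selects:

  // Hypothetical probe: prints which code path these build flags enable.
  #include <cstdio>
  int main() {
  #if defined(__SSSE3__)
    std::printf("SSSE3 fast path compiled in\n");
  #else
    std::printf("portable fallback compiled in\n");
  #endif
    return 0;
  }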
@@ -104,16 +95,9 @@ void UnalignedCopy64(const void* src, void* dst) {
}
void UnalignedCopy128(const void* src, void* dst) {
- // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
- // SSE2 moves for memcpy(dst, src, 16).
-#if SNAPPY_HAVE_SSE2
- __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
- _mm_storeu_si128(static_cast<__m128i*>(dst), x);
-#else
char tmp[16];
memcpy(tmp, src, 16);
memcpy(dst, tmp, 16);
-#endif
}
// Copy [src, src+(op_limit-op)) to [op, op_limit) a byte at a time. Used
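With the intrinsics removed, UnalignedCopy128 is just the
memcpy-through-a-local idiom. Shown standalone below under a hypothetical
name: it is aliasing-safe, and current GCC and Clang at -O2 typically
lower the fixed-size copies to unaligned 16-byte vector moves, which is
exactly what the deleted TODO anticipated.

  #include <cstring>

  // Sketch of the idiom kept by the patch: two fixed-size memcpys through
  // a local buffer; the compiler lowers them to 16-byte vector moves.
  inline void Copy16(const void* src, void* dst) {
    char tmp[16];
    std::memcpy(tmp, src, 16);
    std::memcpy(dst, tmp, 16);
  }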
@@ -183,7 +167,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// Handle the uncommon case where pattern is less than 8 bytes.
if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
-#if defined __SSSE3__
+#if defined(__SSSE3__)
// Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
// to permute the register's contents in-place into a repeating sequence of
// the first "pattern_size" bytes.
@@ -205,7 +189,8 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// Uninitialized bytes are masked out by the shuffle mask.
SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
pattern_size *= 16 / pattern_size;
- while (op < op_limit && op <= buf_limit - 16) {
+ char* op_end = std::min(op_limit, buf_limit - 15);
+ while (op < op_end) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
op += pattern_size;
}
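The rewritten loop above rests on a pointer identity: for char pointers,
op <= buf_limit - 16 is the same test as op < buf_limit - 15, so the two
per-iteration comparisons fold into a single precomputed bound. A
standalone sketch of the hoisted-bound shape (hypothetical name, the
16-byte store elided):

  #include <algorithm>
  #include <cstddef>

  // Sketch: emit 16-byte blocks while op < op_limit and a full 16-byte
  // store still fits before buf_limit; both tests fold into one bound
  // computed once, outside the loop.
  inline char* StoreBlocks(char* op, char* op_limit, char* buf_limit,
                           size_t step) {
    char* const op_end = std::min(op_limit, buf_limit - 15);
    while (op < op_end) {
      // (a 16-byte unaligned store of the pattern would go here)
      op += step;
    }
    return op;
  }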
@@ -1031,13 +1016,19 @@ size_t Compress(Source* reader, Sink* writer) {
class SnappyIOVecWriter {
private:
const struct iovec* output_iov_;
- const size_t output_iov_count_;
- // We are currently writing into output_iov_[curr_iov_index_].
- size_t curr_iov_index_;
+ // output_iov_end_ is set to iov + count and used to determine when
+ // the end of the iovs is reached.
+ const struct iovec* output_iov_end_;
+
+ // Current iov that is being written into.
+ const struct iovec* curr_iov_;
- // Bytes written to output_iov_[curr_iov_index_] so far.
- size_t curr_iov_written_;
+ // Pointer to current iov's write location.
+ char* curr_iov_output_;
+
+ // Remaining bytes to write into curr_iov_output_.
+ size_t curr_iov_remaining_;
// Total bytes decompressed into output_iov_ so far.
size_t total_written_;
@@ -1045,9 +1036,8 @@ class SnappyIOVecWriter {
// Maximum number of bytes that will be decompressed into output_iov_.
size_t output_limit_;
- inline char* GetIOVecPointer(size_t index, size_t offset) {
- return reinterpret_cast<char*>(output_iov_[index].iov_base) +
- offset;
+ static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
+ return reinterpret_cast<char*>(iov->iov_base) + offset;
}
public:
@@ -1055,12 +1045,13 @@ class SnappyIOVecWriter {
// entire lifetime of the SnappyIOVecWriter.
inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
: output_iov_(iov),
- output_iov_count_(iov_count),
- curr_iov_index_(0),
- curr_iov_written_(0),
+ output_iov_end_(iov + iov_count),
+ curr_iov_(iov),
+ curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
+ : nullptr),
+ curr_iov_remaining_(iov_count ? iov->iov_len : 0),
total_written_(0),
- output_limit_(-1) {
- }
+ output_limit_(-1) {}
inline void SetExpectedLength(size_t len) {
output_limit_ = len;
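For context, SnappyIOVecWriter is the backend of snappy's public iovec
API. A usage sketch, assuming the two destination buffers are together
large enough for the decompressed output:

  #include <sys/uio.h>
  #include <string>
  #include "snappy.h"

  // Decompress a snappy buffer straight into two caller-provided buffers
  // through the public RawUncompressToIOVec API, which drives the writer
  // being optimized in this patch.
  bool DecompressSplit(const std::string& compressed,
                       char* a, size_t a_len, char* b, size_t b_len) {
    struct iovec iov[2];
    iov[0].iov_base = a; iov[0].iov_len = a_len;
    iov[1].iov_base = b; iov[1].iov_len = b_len;
    return snappy::RawUncompressToIOVec(compressed.data(), compressed.size(),
                                        iov, 2);
  }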
@@ -1075,23 +1066,25 @@ class SnappyIOVecWriter {
return false;
}
+ return AppendNoCheck(ip, len);
+ }
+
+ inline bool AppendNoCheck(const char* ip, size_t len) {
while (len > 0) {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+ if (curr_iov_remaining_ == 0) {
// This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
return false;
}
- curr_iov_written_ = 0;
- ++curr_iov_index_;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
}
- const size_t to_write = std::min(
- len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
- memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- ip,
- to_write);
- curr_iov_written_ += to_write;
+ const size_t to_write = std::min(len, curr_iov_remaining_);
+ memcpy(curr_iov_output_, ip, to_write);
+ curr_iov_output_ += to_write;
+ curr_iov_remaining_ -= to_write;
total_written_ += to_write;
ip += to_write;
len -= to_write;
@@ -1103,11 +1096,11 @@ class SnappyIOVecWriter {
inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
const size_t space_left = output_limit_ - total_written_;
if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
- output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+ curr_iov_remaining_ >= 16) {
// Fast path, used for the majority (about 95%) of invocations.
- char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
- UnalignedCopy128(ip, ptr);
- curr_iov_written_ += len;
+ UnalignedCopy128(ip, curr_iov_output_);
+ curr_iov_output_ += len;
+ curr_iov_remaining_ -= len;
total_written_ += len;
return true;
}
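The fast path above copies a fixed 16 bytes but advances the cursor by
only len. Its guards make that safe: available >= 16 + kMaximumTagLength
bounds the read past ip, and curr_iov_remaining_ >= 16 bounds the write,
so the surplus bytes are simply overwritten by the next append. A
hypothetical standalone helper showing just the trick:

  #include <cstring>

  // Sketch: copy a fixed 16 bytes (the same copy UnalignedCopy128 does),
  // then advance the cursor by only `len`. Caller guarantees at least 16
  // readable bytes at `ip` and 16 writable bytes at `out`.
  inline char* FastAppend16(const char* ip, char* out, size_t len) {
    char tmp[16];
    std::memcpy(tmp, ip, 16);
    std::memcpy(out, tmp, 16);
    return out + len;   // cursor moves by len, not by 16
  }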
@@ -1116,7 +1109,9 @@ class SnappyIOVecWriter {
}
inline bool AppendFromSelf(size_t offset, size_t len) {
- if (offset > total_written_ || offset == 0) {
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
+ // the "offset - 1u" trick.
+ if (offset - 1u >= total_written_) {
return false;
}
const size_t space_left = output_limit_ - total_written_;
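The "offset - 1u" comparison above folds two range checks into one: in
unsigned arithmetic an offset of zero wraps around to SIZE_MAX, which
always fails the test. A standalone sketch of the identity:

  #include <cstddef>

  // Sketch of the trick borrowed from SnappyArrayWriter: a single
  // unsigned comparison equivalent to (offset != 0 && offset <= total).
  inline bool ValidOffset(size_t offset, size_t total_written) {
    return offset - 1u < total_written;
  }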
@@ -1125,8 +1120,8 @@ class SnappyIOVecWriter {
}
// Locate the iovec from which we need to start the copy.
- size_t from_iov_index = curr_iov_index_;
- size_t from_iov_offset = curr_iov_written_;
+ const iovec* from_iov = curr_iov_;
+ size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
while (offset > 0) {
if (from_iov_offset >= offset) {
from_iov_offset -= offset;
@@ -1134,47 +1129,45 @@ class SnappyIOVecWriter {
}
offset -= from_iov_offset;
- assert(from_iov_index > 0);
- --from_iov_index;
- from_iov_offset = output_iov_[from_iov_index].iov_len;
+ --from_iov;
+ assert(from_iov >= output_iov_);
+ from_iov_offset = from_iov->iov_len;
}
// Copy <len> bytes starting from the iovec pointed to by from_iov_index to
// the current iovec.
while (len > 0) {
- assert(from_iov_index <= curr_iov_index_);
- if (from_iov_index != curr_iov_index_) {
- const size_t to_copy = std::min(
- output_iov_[from_iov_index].iov_len - from_iov_offset,
- len);
- Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+ assert(from_iov <= curr_iov_);
+ if (from_iov != curr_iov_) {
+ const size_t to_copy =
+ std::min(from_iov->iov_len - from_iov_offset, len);
+ AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
len -= to_copy;
if (len > 0) {
- ++from_iov_index;
+ ++from_iov;
from_iov_offset = 0;
}
} else {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
- curr_iov_written_,
- len);
+ size_t to_copy = curr_iov_remaining_;
if (to_copy == 0) {
// This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
return false;
}
- ++curr_iov_index_;
- curr_iov_written_ = 0;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
continue;
}
if (to_copy > len) {
to_copy = len;
}
- IncrementalCopySlow(
- GetIOVecPointer(from_iov_index, from_iov_offset),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
- curr_iov_written_ += to_copy;
+
+ IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+ curr_iov_output_, curr_iov_output_ + to_copy,
+ curr_iov_output_ + curr_iov_remaining_);
+ curr_iov_output_ += to_copy;
+ curr_iov_remaining_ -= to_copy;
from_iov_offset += to_copy;
total_written_ += to_copy;
len -= to_copy;
diff --git a/snappy_unittest.cc b/snappy_unittest.cc
index fcb3261..0548d2a 100644
--- a/snappy_unittest.cc
+++ b/snappy_unittest.cc
@@ -421,7 +421,7 @@ static void VerifyIOVec(const string& input) {
if (rnd.OneIn(5)) {
iov[i].iov_len = 0;
} else {
- iov[i].iov_len = rnd.Uniform(input.size());
+ iov[i].iov_len = rnd.Uniform(input.size() - used_so_far);
}
}
used_so_far += iov[i].iov_len;
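The one-line test fix above keeps each randomly drawn iov_len within the
input bytes not yet assigned; previously every length was drawn from
[0, input.size()), so the iovec lengths could sum to far more than the
input. A hypothetical standalone re-creation of the fixed sizing loop
(the test's rnd.Uniform(n) returns a value in [0, n)):

  #include <cstddef>
  #include <random>
  #include <vector>

  // Draw random iovec lengths that never exceed the bytes still
  // unassigned, mirroring rnd.Uniform(input.size() - used_so_far).
  std::vector<size_t> RandomLengths(size_t input_size, size_t count,
                                    std::mt19937& rng) {
    std::vector<size_t> lens(count, 0);
    size_t used_so_far = 0;
    for (size_t i = 0; i < count && used_so_far < input_size; ++i) {
      lens[i] = std::uniform_int_distribution<size_t>(
          0, input_size - used_so_far - 1)(rng);  // in [0, remaining)
      used_so_far += lens[i];
    }
    return lens;
  }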