Speed up decompression by making the fast path for literals faster.

We do the fast-path step as soon as possible; in fact, as soon as we know the literal length. Since we usually hit the fast path, we can then skip the checks for long literals and available input space (beyond what the fast path check already does).

Note that this changes the decompression Writer API; however, it does not change the ABI, since writers are always templatized and as such never cross compilation units. The new API is slightly more general, in that it doesn't hard-code the value 16.

Note that we also take care to check for len <= 16 first, since the other two checks almost always succeed (so we don't want to waste time checking for them until we have to).

The improvements are most marked on Nehalem, but are generally positive on other platforms as well. All microbenchmarks are 64-bit, opt.

Clovertown (Core 2):

Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------
BM_UFlat/0 110226 110224 100000 886.0MB/s html [ +1.5%]
BM_UFlat/1 1036523 1036508 10000 646.0MB/s urls [ -0.8%]
BM_UFlat/2 26775 26775 522570 4.4GB/s jpg [ +0.0%]
BM_UFlat/3 49738 49737 280974 1.8GB/s pdf [ +0.3%]
BM_UFlat/4 446790 446792 31334 874.3MB/s html4 [ +0.8%]
BM_UFlat/5 40561 40562 350424 578.5MB/s cp [ +1.3%]
BM_UFlat/6 18722 18722 746903 568.0MB/s c [ +1.4%]
BM_UFlat/7 5373 5373 2608632 660.5MB/s lsp [ +8.3%]
BM_UFlat/8 1615716 1615718 8670 607.8MB/s xls [ +2.0%]
BM_UFlat/9 345278 345281 40481 420.1MB/s txt1 [ +1.4%]
BM_UFlat/10 294855 294855 47452 404.9MB/s txt2 [ +1.6%]
BM_UFlat/11 914263 914263 15316 445.2MB/s txt3 [ +1.1%]
BM_UFlat/12 1222694 1222691 10000 375.8MB/s txt4 [ +1.4%]
BM_UFlat/13 584495 584489 23954 837.4MB/s bin [ -0.6%]
BM_UFlat/14 66662 66662 210123 547.1MB/s sum [ +1.2%]
BM_UFlat/15 7368 7368 1881856 547.1MB/s man [ +4.0%]
BM_UFlat/16 110727 110726 100000 1021.4MB/s pb [ +2.3%]
BM_UFlat/17 382138 382141 36616 460.0MB/s gaviota [ -0.7%]

Westmere (Core i7):

Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------
BM_UFlat/0 78861 78853 177703 1.2GB/s html [ +2.1%]
BM_UFlat/1 739560 739491 18912 905.4MB/s urls [ +3.4%]
BM_UFlat/2 9867 9866 1419014 12.0GB/s jpg [ +3.4%]
BM_UFlat/3 31989 31986 438385 2.7GB/s pdf [ +0.2%]
BM_UFlat/4 319406 319380 43771 1.2GB/s html4 [ +1.9%]
BM_UFlat/5 29639 29636 472862 791.7MB/s cp [ +5.2%]
BM_UFlat/6 13478 13477 1000000 789.0MB/s c [ +2.3%]
BM_UFlat/7 4030 4029 3475364 880.7MB/s lsp [ +8.7%]
BM_UFlat/8 1036585 1036492 10000 947.5MB/s xls [ +6.9%]
BM_UFlat/9 242127 242105 57838 599.1MB/s txt1 [ +3.0%]
BM_UFlat/10 206499 206480 67595 578.2MB/s txt2 [ +3.4%]
BM_UFlat/11 641635 641570 21811 634.4MB/s txt3 [ +2.4%]
BM_UFlat/12 848847 848769 16443 541.4MB/s txt4 [ +3.1%]
BM_UFlat/13 384968 384938 36366 1.2GB/s bin [ +0.3%]
BM_UFlat/14 47106 47101 297770 774.3MB/s sum [ +4.4%]
BM_UFlat/15 5063 5063 2772202 796.2MB/s man [ +7.7%]
BM_UFlat/16 83663 83656 167697 1.3GB/s pb [ +1.8%]
BM_UFlat/17 260224 260198 53823 675.6MB/s gaviota [ -0.5%]

Barcelona (Opteron):

Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------
BM_UFlat/0 112490 112457 100000 868.4MB/s html [ -0.4%]
BM_UFlat/1 1066719 1066339 10000 627.9MB/s urls [ +1.0%]
BM_UFlat/2 24679 24672 563802 4.8GB/s jpg [ +0.7%]
BM_UFlat/3 50603 50589 277285 1.7GB/s pdf [ +2.6%]
BM_UFlat/4 452982 452849 30900 862.6MB/s html4 [ -0.2%]
BM_UFlat/5 43860 43848 319554 535.1MB/s cp [ +1.2%]
BM_UFlat/6 21419 21413 653573 496.6MB/s c [ +1.0%]
BM_UFlat/7 6646 6645 2105405 534.1MB/s lsp [ +0.3%]
BM_UFlat/8 1828487 1827886 7658 537.3MB/s xls [ +2.6%]
BM_UFlat/9 391824 391714 35708 370.3MB/s txt1 [ +2.2%]
BM_UFlat/10 334913 334816 41885 356.6MB/s txt2 [ +1.7%]
BM_UFlat/11 1042062 1041674 10000 390.7MB/s txt3 [ +1.1%]
BM_UFlat/12 1398902 1398456 10000 328.6MB/s txt4 [ +1.7%]
BM_UFlat/13 545706 545530 25669 897.2MB/s bin [ -0.4%]
BM_UFlat/14 71512 71505 196035 510.0MB/s sum [ +1.4%]
BM_UFlat/15 8422 8421 1665036 478.7MB/s man [ +2.6%]
BM_UFlat/16 112053 112048 100000 1009.3MB/s pb [ -0.4%]
BM_UFlat/17 416723 416713 33612 421.8MB/s gaviota [ -2.0%]

R=sanjay

git-svn-id: http://snappy.googlecode.com/svn/trunk@53 03e5f5b5-db94-4691-08a0-1a8bf15f6143
author: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-11-23 11:14:17 +0000
committer: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-11-23 11:14:17 +0000
commit: dc9e55f2949e4518742d029c724f416abae3f5a5 (patch)
tree: 96a0de265647fab0d705ec603c054a7454f9ac12
parent: b60a5ced174a8c571654ad6eb05ae9da8854b111 (diff)
download: snappy-dc9e55f2949e4518742d029c724f416abae3f5a5.tar.gz
1 files changed, 50 insertions, 22 deletions
diff --git a/snappy.cc b/snappy.cc
index c79edb5..799a640 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -435,12 +435,26 @@ char* CompressFragment(const char* input,
 //   bool CheckLength() const;
 //
 //   // Called repeatedly during decompression
-//   bool Append(const char* ip, uint32 length, bool allow_fast_path);
+//   bool Append(const char* ip, uint32 length);
 //   bool AppendFromSelf(uint32 offset, uint32 length);
-// };
 //
-// "allow_fast_path" is a parameter that says if there is at least 16
-// readable bytes in "ip". It is currently only used by SnappyArrayWriter.
+//   // The difference between TryFastAppend and Append is that TryFastAppend
+//   // is allowed to read up to <available> bytes from the input buffer,
+//   // whereas Append is allowed to read <length>.
+//   //
+//   // Also, TryFastAppend is allowed to return false, declining the append,
+//   // without it being a fatal error -- just "return false" would be
+//   // a perfectly legal implementation of TryFastAppend. The intention
+//   // is for TryFastAppend to allow a fast path in the common case of
+//   // a small append.
+//   //
+//   // NOTE(user): TryFastAppend must always decline (return false)
+//   // if <length> is 61 or more, as in this case the literal length is not
+//   // decoded fully. In practice, this should not be a big problem,
+//   // as it is unlikely that one would implement a fast path accepting
+//   // this much data.
+//   bool TryFastAppend(const char* ip, uint32 available, uint32 length);
+// };
 
 // -----------------------------------------------------------------------
 // Lookup table for decompression code.  Generated by ComputeTable() below.
@@ -665,19 +679,23 @@ class SnappyDecompressor {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
 
       if ((c & 0x3) == LITERAL) {
-        uint32 literal_length = c >> 2;
-        if (PREDICT_FALSE(literal_length >= 60)) {
+        uint32 literal_length = (c >> 2) + 1;
+        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
+          DCHECK_LT(literal_length, 61);
+          ip += literal_length;
+          continue;
+        }
+        if (PREDICT_FALSE(literal_length >= 61)) {
           // Long literal.
-          const uint32 literal_length_length = literal_length - 59;
+          const uint32 literal_length_length = literal_length - 60;
           literal_length =
-              LittleEndian::Load32(ip) & wordmask[literal_length_length];
+              (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
           ip += literal_length_length;
         }
-        ++literal_length;
 
         uint32 avail = ip_limit_ - ip;
         while (avail < literal_length) {
-          if (!writer->Append(ip, avail, false)) return;
+          if (!writer->Append(ip, avail)) return;
           literal_length -= avail;
           reader_->Skip(peeked_);
           size_t n;
@@ -687,8 +705,7 @@ class SnappyDecompressor {
           if (avail == 0) return;  // Premature end of input
           ip_limit_ = ip + avail;
         }
-        bool allow_fast_path = (avail >= 16);
-        if (!writer->Append(ip, literal_length, allow_fast_path)) {
+        if (!writer->Append(ip, literal_length)) {
           return;
         }
         ip += literal_length;
@@ -902,21 +919,29 @@ class SnappyArrayWriter {
     return op_ == op_limit_;
   }
 
-  inline bool Append(const char* ip, uint32 len, bool allow_fast_path) {
+  inline bool Append(const char* ip, uint32 len) {
     char* op = op_;
     const int space_left = op_limit_ - op;
-    if (allow_fast_path && len <= 16 && space_left >= 16) {
-      // Fast path, used for the majority (about 90%) of dynamic invocations.
+    if (space_left < len) {
+      return false;
+    }
+    memcpy(op, ip, len);
+    op_ = op + len;
+    return true;
+  }
+
+  inline bool TryFastAppend(const char* ip, uint32 available, uint32 len) {
+    char* op = op_;
+    const int space_left = op_limit_ - op;
+    if (len <= 16 && available >= 16 && space_left >= 16) {
+      // Fast path, used for the majority (about 95%) of invocations.
       UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
       UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+      op_ = op + len;
+      return true;
     } else {
-      if (space_left < len) {
-        return false;
-      }
-      memcpy(op, ip, len);
+      return false;
     }
-    op_ = op + len;
-    return true;
   }
 
   inline bool AppendFromSelf(uint32 offset, uint32 len) {
@@ -985,10 +1010,13 @@ class SnappyDecompressionValidator {
   inline bool CheckLength() const {
     return expected_ == produced_;
   }
-  inline bool Append(const char* ip, uint32 len, bool allow_fast_path) {
+  inline bool Append(const char* ip, uint32 len) {
     produced_ += len;
     return produced_ <= expected_;
   }
+  inline bool TryFastAppend(const char* ip, uint32 available, uint32 length) {
+    return false;
+  }
   inline bool AppendFromSelf(uint32 offset, uint32 len) {
     if (produced_ <= offset - 1u) return false;  // -1u catches offset==0
     produced_ += len;
author	snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>	2011-11-23 11:14:17 +0000
committer	snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>	2011-11-23 11:14:17 +0000
commit	dc9e55f2949e4518742d029c724f416abae3f5a5 (patch)
tree	96a0de265647fab0d705ec603c054a7454f9ac12
parent	b60a5ced174a8c571654ad6eb05ae9da8854b111 (diff)
download	snappy-dc9e55f2949e4518742d029c724f416abae3f5a5.tar.gz