-rw-r--r--  snappy.cc          | 310
-rw-r--r--  snappy_unittest.cc |   4
2 files changed, 188 insertions(+), 126 deletions(-)
diff --git a/snappy.cc b/snappy.cc
index 7578901..8449b6e 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -70,6 +70,7 @@
#include <algorithm>
#include <array>
+#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
@@ -79,6 +80,8 @@
namespace snappy {
+namespace {
+
// The amount of slop bytes writers are using for unconditional copies.
constexpr int kSlopBytes = 64;
@@ -90,25 +93,30 @@ using internal::kMaximumTagLength;
using internal::LITERAL;
// We translate the information encoded in a tag through a lookup table to a
-// format that requires fewer instructions to decode.
-// The returned format encodes the offset in the high byte and the length
-// in the low byte. Because length will never be 0, we use zero as an indicator
-// for an exceptional value (copy 3 tag or a literal > 60 bytes).
-constexpr size_t kLiteralOffset = 256;
-inline constexpr uint16_t MakeEntry(uint16_t len, uint16_t offset) {
- return len | (offset << 8);
+// format that requires fewer instructions to decode. Effectively we store
+// the length minus the tag part of the offset. The least significant byte
+// thus stores the length, while length - offset is given by
+// entry - ExtractOffset(type). The nice thing is that the subtraction
+// immediately sets the flags for the check that offset >= length, which
+// folds the cmp into the sub. We engineer the long literals and copy-4 to
+// always fail this check, so their presence doesn't affect the fast path.
+// To prevent literals from triggering the guard against offset < length
+// (offset does not apply to literals) the table gives them a spurious
+// offset of 256.
+inline constexpr int16_t MakeEntry(int16_t len, int16_t offset) {
+ return len - (offset << 8);
}
-inline constexpr uint16_t OffsetAndLength(uint8_t tag, int type) {
- return type == 3 ? 0 // copy-4 (or type == 3)
- : type == 2 ? MakeEntry((tag >> 2) + 1, 0) // copy-2
- : type == 1 ? MakeEntry(((tag >> 2) & 7) + 4, tag >> 5) // copy-1
- : tag < 60 * 4 ? MakeEntry((tag >> 2) + 1, 1)
- : 0; // long literal
+inline constexpr int16_t LengthMinusOffset(int data, int type) {
+ return type == 3 ? 0xFF // copy-4 (or type == 3)
+ : type == 2 ? MakeEntry(data + 1, 0) // copy-2
+ : type == 1 ? MakeEntry((data & 7) + 4, data >> 3) // copy-1
+ : data < 60 ? MakeEntry(data + 1, 1) // note spurious offset.
+ : 0xFF; // long literal
}
-inline constexpr uint16_t OffsetAndLength(uint8_t tag) {
- return OffsetAndLength(tag, tag & 3);
+inline constexpr int16_t LengthMinusOffset(uint8_t tag) {
+ return LengthMinusOffset(tag >> 2, tag & 3);
}
template <size_t... Ints>
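
A minimal standalone sketch of the folded comparison described in the comment
above, for a copy-1 tag. The values and the lower_offset_bits name are
illustrative, not taken from the patch:

    #include <cassert>
    #include <cstdint>

    constexpr int16_t MakeEntry(int16_t len, int16_t offset) {
      return len - (offset << 8);
    }

    int main() {
      // copy-1 tag with length 7 and high offset bits equal to 2.
      int16_t entry = MakeEntry(/*len=*/7, /*offset=*/2);
      int lower_offset_bits = 0x30;  // Low offset byte read from the stream.
      // A single subtraction yields length - offset and sets the sign needed
      // for the offset >= length check, folding the cmp into the sub.
      int len_min_offset = entry - lower_offset_bits;
      int len = entry & 0xFF;             // Low byte still holds the length.
      int offset = len - len_min_offset;  // (2 << 8) + 0x30 == 0x230.
      assert(len == 7);
      assert(offset == 0x230);
      assert(len_min_offset <= 0);        // offset >= length: fast path taken.
      return 0;
    }
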
@@ -121,23 +129,29 @@ template <size_t... Is>
struct make_index_sequence<0, Is...> : index_sequence<Is...> {};
template <size_t... seq>
-constexpr std::array<uint16_t, 256> MakeTable(index_sequence<seq...>) {
- return std::array<uint16_t, 256>{OffsetAndLength(seq)...};
+constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) {
+ return std::array<int16_t, 256>{LengthMinusOffset(seq)...};
}
-alignas(64) constexpr std::array<uint16_t, 256> offset_and_length_table =
- MakeTable(make_index_sequence<256>{});
+// We maximally co-locate the two tables so that only one register needs to be
+// reserved for the table address.
+struct {
+ alignas(64) const std::array<int16_t, 256> length_minus_offset;
+ uint32_t extract_masks[4]; // Used for extracting offset based on tag type.
+} table = {MakeTable(make_index_sequence<256>{}), {0, 0xFF, 0xFFFF, 0}};
// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
// compression for compressible input, and more speed for incompressible
// input. Of course, it doesn't hurt if the hash function is reasonably fast
// either, as it gets called a lot.
-static inline uint32_t HashBytes(uint32_t bytes, uint32_t mask) {
+inline uint32_t HashBytes(uint32_t bytes, uint32_t mask) {
constexpr uint32_t kMagic = 0x1e35a7bd;
return ((kMagic * bytes) >> (32 - kMaxHashTableBits)) & mask;
}
+} // namespace
+
size_t MaxCompressedLength(size_t source_bytes) {
// Compressed data can be defined as:
// compressed := item* literal*
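
As a hedged illustration of how a multiplicative hash like HashBytes above is
typically consumed by the compressor, the sketch below probes a power-of-two
candidate table; the table size, the FindAndRecordCandidate name and its
layout are assumptions for illustration, not code from this patch:

    #include <cstdint>
    #include <cstring>

    constexpr int kMaxHashTableBits = 14;  // Assumed value for the sketch.
    constexpr uint32_t kMagic = 0x1e35a7bd;

    inline uint32_t HashBytes(uint32_t bytes, uint32_t mask) {
      return ((kMagic * bytes) >> (32 - kMaxHashTableBits)) & mask;
    }

    // Returns the previous position recorded for the 4 bytes at `p` and
    // records `pos` as the new candidate. `mask` is table_size - 1.
    inline uint16_t FindAndRecordCandidate(uint16_t* table, uint32_t mask,
                                           const char* p, uint16_t pos) {
      uint32_t bytes;
      std::memcpy(&bytes, p, sizeof(bytes));  // Unaligned 4-byte load.
      uint32_t slot = HashBytes(bytes, mask);
      uint16_t candidate = table[slot];
      table[slot] = pos;
      return candidate;
    }
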
@@ -967,6 +981,72 @@ static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
return (value & masks[shift]) != 0;
}
+inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
+ return offset != 0;
+}
+
+void MemCopy(char* dst, const uint8_t* src, size_t size) {
+ std::memcpy(dst, src, size);
+}
+
+void MemCopy(ptrdiff_t dst, const uint8_t* src, size_t size) {}
+
+void MemMove(char* dst, const void* src, size_t size) {
+ std::memmove(dst, src, size);
+}
+
+void MemMove(ptrdiff_t dst, const void* src, size_t size) {}
+
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+size_t AdvanceToNextTag(const uint8_t** ip_p, size_t* tag) {
+ const uint8_t*& ip = *ip_p;
+ // This section is crucial for the throughput of the decompression loop.
+ // The latency of an iteration is fundamentally constrained by the
+ // following data chain on ip.
+ // ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
+ // ip2 = ip + 2 + (c >> 2)
+ // This amounts to 8 cycles.
+ // 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
+ size_t literal_len = *tag >> 2;
+ size_t tag_type = *tag;
+ bool is_literal;
+#if defined(__GNUC__) && defined(__x86_64__)
+ // TODO clang misses the fact that the (c & 3) already correctly
+ // sets the zero flag.
+ asm("and $3, %k[tag_type]\n\t"
+ : [tag_type] "+r"(tag_type), "=@ccz"(is_literal));
+#else
+ tag_type &= 3;
+ is_literal = (tag_type == 0);
+#endif
+ // TODO
+  // This code is subtle. Loading the values first and then doing the cmov has
+  // less latency than cmov'ing ip and then loading. However clang would move
+  // the loads in an optimization phase; volatile prevents this transformation.
+ // Note that we have enough slop bytes (64) that the loads are always valid.
+ size_t tag_literal =
+ static_cast<const volatile uint8_t*>(ip)[1 + literal_len];
+ size_t tag_copy = static_cast<const volatile uint8_t*>(ip)[tag_type];
+ *tag = is_literal ? tag_literal : tag_copy;
+ const uint8_t* ip_copy = ip + 1 + tag_type;
+ const uint8_t* ip_literal = ip + 2 + literal_len;
+ ip = is_literal ? ip_literal : ip_copy;
+#if defined(__GNUC__) && defined(__x86_64__)
+  // TODO Clang is "optimizing" the zero-extension (a totally free
+  // operation); this means that after the cmov of tag, it emits another
+  // movzb tag, byte(tag). It really matters as it's on the core chain. This
+  // dummy asm persuades clang to do the zero-extension at the load (where it
+  // is automatic), removing the expensive movzb.
+ asm("" ::"r"(tag_copy));
+#endif
+ return tag_type;
+}
+
+// Extracts the offset for copy-1 and copy-2; returns 0 for literals and
+// copy-4.
+inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
+  return val & table.extract_masks[tag_type];
+}
+
// Core decompression loop, when there is enough data available.
// Decompresses the input buffer [ip, ip_limit) into the output buffer
// [op, op_limit_min_slop). Returning when either we are too close to the end
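
The pointer-advance trick above can also be written portably; this hedged
sketch (illustrative name, no inline asm, no volatile) shows the same
load-both-then-select structure the comments describe:

    #include <cstddef>
    #include <cstdint>

    // Portable version of the branchless advance: ip points one past the
    // current tag on entry and one past the next tag on return. Relies on
    // kSlopBytes of readable input past ip, so both speculative loads are in
    // bounds.
    size_t AdvanceToNextTagSketch(const uint8_t** ip_p, size_t* tag) {
      const uint8_t*& ip = *ip_p;
      size_t literal_len = *tag >> 2;  // Literal length field (length - 1).
      size_t tag_type = *tag & 3;      // 0 = literal, 1/2 = copy, 3 = copy-4.
      bool is_literal = (tag_type == 0);
      // Load both possible next tags first, then select. Keeping the loads
      // off the ip -> load -> ip chain is what the volatile loads in the real
      // code enforce against clang's reordering.
      size_t tag_literal = ip[1 + literal_len];  // Next tag if a literal.
      size_t tag_copy = ip[tag_type];            // Next tag if a copy.
      *tag = is_literal ? tag_literal : tag_copy;
      ip = is_literal ? ip + 2 + literal_len : ip + 1 + tag_type;
      // For copy-4 tags and long literals the advance is not meaningful, but
      // the main loop restores old_ip and bails out before relying on it.
      return tag_type;
    }
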
@@ -976,113 +1056,92 @@ static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
// TODO This function probably does not need to be inlined, as it
// should decode large chunks at a time. This allows runtime dispatch to
// implementations based on CPU capability (BMI2 / perhaps 32 / 64 byte memcpy).
-std::pair<const uint8_t*, char*> DecompressBranchless(
- const uint8_t* ip, const uint8_t* ip_limit, char* op_ptr, char* op_base,
- char* op_limit_min_slop_ptr) {
- std::ptrdiff_t op = op_ptr - op_base;
- std::ptrdiff_t op_limit_min_slop = op_limit_min_slop_ptr - op_base;
- std::ptrdiff_t op_limit = op_limit_min_slop + kSlopBytes - 1;
- if (kSlopBytes < ip_limit - ip && op < op_limit_min_slop) {
- const uint8_t* const ip_limit_min_slop = ip_limit - kSlopBytes + 1;
+template <typename T>
+std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
+ const uint8_t* ip, const uint8_t* ip_limit, ptrdiff_t op, T op_base,
+ ptrdiff_t op_limit_min_slop) {
+ // We unroll the inner loop twice so we need twice the spare room.
+ op_limit_min_slop -= kSlopBytes;
+ if (2 * (kSlopBytes + 1) < ip_limit - ip && op < op_limit_min_slop) {
+ const uint8_t* const ip_limit_min_slop = ip_limit - 2 * kSlopBytes - 1;
ip++;
// ip points just past the tag and we are touching at maximum kSlopBytes
// in an iteration.
+ size_t tag = ip[-1];
do {
- const uint8_t* old_ip = ip;
- const uint64_t tag = ip[-1];
- uint32_t offset_and_length = offset_and_length_table[tag];
- if (SNAPPY_PREDICT_FALSE(offset_and_length == 0)) {
- // Exception case (long literal or copy 4).
- if ((tag & 3) == 3) {
- break_loop:
- ip = old_ip;
- break;
+      // The throughput is limited by instructions; unrolling the inner loop
+      // twice reduces the number of instructions checking limits and also
+      // leads to fewer mov instructions.
+ for (int i = 0; i < 2; i++) {
+ const uint8_t* old_ip = ip;
+ assert(tag == ip[-1]);
+        // For literals tag_type = 0, hence we will always obtain 0 from
+        // ExtractOffset. Their offset will thus be the spurious 256 baked
+        // into the table.
+ ptrdiff_t len_min_offset = table.length_minus_offset[tag];
+ size_t tag_type = AdvanceToNextTag(&ip, &tag);
+ uint32_t next = LittleEndian::Load32(old_ip);
+ size_t len = len_min_offset & 0xFF;
+ len_min_offset -= ExtractOffset(next, tag_type);
+ if (SNAPPY_PREDICT_FALSE(len_min_offset > 0)) {
+ if (SNAPPY_PREDICT_FALSE(len & 0x80)) {
+            // Exceptional case (long literal or copy-4).
+            // Actually doing the copy here negatively impacts the main loop
+            // because the compiler incorrectly allocates a register for this
+            // fallback. Hence we just break.
+ break_loop:
+ ip = old_ip;
+ goto exit;
+ }
+ // Only copy-1 or copy-2 tags can get here.
+ assert(tag_type == 1 || tag_type == 2);
+ std::ptrdiff_t delta = op + len_min_offset - len;
+ // Guard against copies before the buffer start.
+ if (SNAPPY_PREDICT_FALSE(delta < 0 ||
+ !Copy64BytesWithPatternExtension(
+ op_base + op, len - len_min_offset))) {
+ goto break_loop;
+ }
+ op += len;
+ continue;
}
- assert((tag & 3) == 0); // This is a literal
- assert(tag >= 60 * 4); // It's a long literal
- uint32_t next = LittleEndian::Load32(ip);
- uint32_t length_bytes = (tag >> 2) - 59;
- uint32_t literal_len = ExtractLowBytes(next, length_bytes) + 1;
- ip += length_bytes;
- // Note we use >= instead of > because we also need to increment ip
- // because we have to leave it past the tag.
- if (literal_len >= ip_limit - ip) goto break_loop;
- if (literal_len > op_limit - op) goto break_loop;
- memcpy(op_base + op, ip, literal_len);
- ip += literal_len;
- op += literal_len;
- assert(ip < ip_limit); // See above test
- ip++;
- continue;
- }
- size_t tag_type;
- {
- // This section is crucial for the throughput of the decompression loop.
- // The latency of an iteration is fundamentally constrained by the
- // following data chain on ip.
- // ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
- // ip2 = ip + 2 + (c >> 2)
- // This amounts to 8 cycles.
- // 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
- size_t literal_len = tag >> 2;
+ std::ptrdiff_t delta = op + len_min_offset - len;
+ if (SNAPPY_PREDICT_FALSE(delta < 0)) {
#if defined(__GNUC__) && defined(__x86_64__)
- // TODO
- // clang misses the fact that the (c & 3) already correctly sets
- // the zero flag.
- tag_type = tag;
- bool is_literal;
- asm("and $3, %0\n\t" : "+r"(tag_type), "=@ccz"(is_literal));
- bool is_copy = !is_literal;
-#else
- tag_type = tag & 3;
- bool is_copy = (tag_type != 0);
+ // TODO
+          // When validating, both code paths reduce to `op += len`. I.e. this
+          // becomes effectively
+ //
+ // if (delta < 0) if (tag_type != 0) goto break_loop;
+ // op += len;
+ //
+ // The compiler interchanges the predictable and almost always false
+ // first if-statement with the completely unpredictable second
+ // if-statement, putting an unpredictable branch on every iteration.
+ // This empty asm is worth almost 2x, which I think qualifies for an
+ // award for the most load-bearing empty statement.
+ asm("");
#endif
- const uint8_t* ip_copy = ip + 1 + tag_type;
- const uint8_t* ip_literal = ip + 2 + literal_len;
- ip = is_copy ? ip_copy : ip_literal;
- }
- uint32_t next = LittleEndian::Load32(old_ip);
- // For literals tag_type = 0, hence we will always obtain 0 from
- // ExtractLowBytes. For literals offset will thus be kLiteralOffset.
- std::ptrdiff_t offset =
- (offset_and_length & 0x700) + ExtractLowBytes(next, tag_type);
- size_t len = offset_and_length & 0xFF;
- std::ptrdiff_t delta = op - offset;
- if (SNAPPY_PREDICT_FALSE(delta < 0)) {
- if (tag_type != 0) goto break_loop;
- std::memcpy(op_base + op, old_ip, 64);
- op += len;
- continue;
- }
- // By choosing literals to have kLiteralOffset this test will
- // always succeed for literals.
- if (SNAPPY_PREDICT_FALSE(std::size_t(offset) < len)) {
- assert(tag_type != 0);
- // offset 0 is an error.
- if (!Copy64BytesWithPatternExtension(op_base + op, offset)) {
- break;
+
+          // Due to the spurious offset of 256 for literals, this will trigger
+          // at the start of a block when op is still smaller than 256.
+ if (tag_type != 0) goto break_loop;
+ MemCopy(op_base + op, old_ip, 64);
+ op += len;
+ continue;
}
+
+        // For copies we read from the output at op_base + delta; for
+        // literals we read from the input stream at old_ip instead.
+ const void* from =
+ tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;
+ MemMove(op_base + op, from, 64);
op += len;
- continue;
}
-
- const uint8_t* from =
- tag_type ? reinterpret_cast<const uint8_t*>(op_base + delta) : old_ip;
- // For literals we need to copy from ip instead of from the stream.
- // We also need to compensate for the offset in that case.
- std::memmove(op_base + op, from, 64);
- op += len;
} while (ip < ip_limit_min_slop && op < op_limit_min_slop);
+ exit:
ip--;
assert(ip <= ip_limit);
}
- return {ip, op_base + op};
-}
-
-template <typename T>
-std::pair<const uint8_t*, T> DecompressBranchless(const uint8_t* ip,
- const uint8_t*, T op, char*,
- char*) {
return {ip, op};
}
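
Putting the pieces together, here is a hedged, simplified sketch of a single
fast-path iteration: no unrolling, no inline asm, a little-endian host
assumed, and the pattern-extending copy omitted. OneIteration and the Tables
struct are illustrative stand-ins, and AdvanceToNextTagSketch is the helper
from the sketch after the previous hunk:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    struct Tables {
      int16_t length_minus_offset[256];
      uint32_t extract_masks[4];  // {0, 0xFF, 0xFFFF, 0}, indexed by tag type.
    };

    size_t AdvanceToNextTagSketch(const uint8_t** ip_p, size_t* tag);

    // Returns false when the slow path must take over (long literal, copy-4,
    // zero offset, or a copy reaching before the buffer start); ip is then
    // restored so the caller can hand off at the current tag. Both buffers
    // are assumed to have 64 bytes of slop, making the fixed-size copies safe.
    bool OneIteration(const Tables& table, const uint8_t*& ip, size_t& tag,
                      char* op_base, ptrdiff_t& op) {
      const uint8_t* old_ip = ip;  // Points just past the current tag byte.
      ptrdiff_t len_min_offset = table.length_minus_offset[tag];
      size_t tag_type = AdvanceToNextTagSketch(&ip, &tag);
      uint32_t next;
      std::memcpy(&next, old_ip, sizeof(next));  // Bytes following the tag.
      size_t len = len_min_offset & 0xFF;
      // Subtracting the low offset bytes folds the offset >= length check
      // into the sign of len_min_offset.
      len_min_offset -= next & table.extract_masks[tag_type];
      if (len_min_offset > 0) {
        if (len & 0x80) {  // Long literal or copy-4 (exceptional entry 0xFF).
          ip = old_ip;
          return false;
        }
        // copy-1 or copy-2 with offset < length: needs pattern extension.
        ptrdiff_t offset = static_cast<ptrdiff_t>(len) - len_min_offset;
        if (op - offset < 0 || offset == 0) {
          ip = old_ip;
          return false;
        }
        // A 64-byte copy with pattern extension would go here (omitted).
        op += len;
        return true;
      }
      ptrdiff_t delta = op + len_min_offset - static_cast<ptrdiff_t>(len);
      if (delta < 0) {  // delta is op - offset.
        // Copies before the buffer start are invalid; literals land here only
        // near the start of a block (spurious offset 256) and simply copy
        // from the input.
        if (tag_type != 0) {
          ip = old_ip;
          return false;
        }
        std::memcpy(op_base + op, old_ip, 64);
        op += len;
        return true;
      }
      // Copies read from the output at op_base + delta, literals from the
      // input at old_ip.
      const void* from = tag_type ? static_cast<const void*>(op_base + delta)
                                  : static_cast<const void*>(old_ip);
      std::memmove(op_base + op, from, 64);
      op += len;
      return true;
    }
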
@@ -1179,15 +1238,15 @@ class SnappyDecompressor {
MAYBE_REFILL();
for (;;) {
{
- char* op_limit_min_slop;
+ ptrdiff_t op_limit_min_slop;
auto op_base = writer->GetBase(&op_limit_min_slop);
if (op_base) {
auto res =
DecompressBranchless(reinterpret_cast<const uint8_t*>(ip),
reinterpret_cast<const uint8_t*>(ip_limit_),
- op, op_base, op_limit_min_slop);
+ op - op_base, op_base, op_limit_min_slop);
ip = reinterpret_cast<const char*>(res.first);
- op = res.second;
+ op = op_base + res.second;
MAYBE_REFILL();
}
}
@@ -1250,7 +1309,7 @@ class SnappyDecompressor {
if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
} else {
- const uint32_t entry = offset_and_length_table[c];
+ const ptrdiff_t entry = table.length_minus_offset[c];
preload = LittleEndian::Load32(ip);
const uint32_t trailer = ExtractLowBytes(preload, c & 3);
const uint32_t length = entry & 0xff;
@@ -1259,7 +1318,7 @@ class SnappyDecompressor {
// copy_offset/256 is encoded in bits 8..10. By just fetching
// those bits, we get copy_offset (since the bit-field starts at
// bit 8).
- const uint32_t copy_offset = (entry & 0x700) + trailer;
+ const uint32_t copy_offset = trailer - entry + length;
if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
ip += (c & 3);
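
Because the table entries changed sign convention, the old bit-field
extraction `(entry & 0x700) + trailer` no longer applies. A quick standalone
check (illustrative values) that the new expression recovers the same
copy_offset: entry = len - (offset_high << 8), so trailer - entry + len
equals trailer + (offset_high << 8).

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int offset_high = 0; offset_high < 8; ++offset_high) {
        for (int len = 4; len <= 11; ++len) {  // copy-1 lengths.
          int16_t entry = static_cast<int16_t>(len - (offset_high << 8));
          uint32_t trailer = 0x5A;             // Any low offset byte.
          uint32_t copy_offset = trailer - entry + len;
          assert(copy_offset == trailer + (offset_high << 8));
        }
      }
      return 0;
    }
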
@@ -1523,7 +1582,7 @@ class SnappyIOVecWriter {
}
char* GetOutputPtr() { return nullptr; }
- char* GetBase(char**) { return nullptr; }
+ char* GetBase(ptrdiff_t*) { return nullptr; }
void SetOutputPtr(char* op) {
// TODO: Switch to [[maybe_unused]] when we can assume C++17.
(void)op;
@@ -1688,8 +1747,8 @@ class SnappyArrayWriter {
inline bool CheckLength() const { return op_ == op_limit_; }
char* GetOutputPtr() { return op_; }
- char* GetBase(char** op_limit_min_slop) {
- *op_limit_min_slop = op_limit_min_slop_;
+ char* GetBase(ptrdiff_t* op_limit_min_slop) {
+ *op_limit_min_slop = op_limit_min_slop_ - base_;
return base_;
}
void SetOutputPtr(char* op) { op_ = op; }
@@ -1782,7 +1841,10 @@ class SnappyDecompressionValidator {
inline SnappyDecompressionValidator() : expected_(0), produced_(0) {}
inline void SetExpectedLength(size_t len) { expected_ = len; }
size_t GetOutputPtr() { return produced_; }
- char* GetBase(char**) { return nullptr; }
+ size_t GetBase(ptrdiff_t* op_limit_min_slop) {
+ *op_limit_min_slop = std::numeric_limits<ptrdiff_t>::max() - kSlopBytes + 1;
+ return 1;
+ }
void SetOutputPtr(size_t op) { produced_ = op; }
inline bool CheckLength() const { return expected_ == produced_; }
inline bool Append(const char* ip, size_t len, size_t* produced) {
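
For context on why the validator can now share DecompressBranchless: with the
integer base returned here, op_base + op is just a number, the no-op
MemCopy/MemMove overloads are selected, and Copy64BytesWithPatternExtension
degenerates to an offset != 0 check, so the loop reduces to bounds accounting.
A hedged sketch of the overload selection (Emit is an illustrative name, not
part of the patch):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Writer-dependent overloads in the spirit of the patch: a real copy for
    // char* destinations, a no-op for the validator's integer "addresses".
    void MemCopy(char* dst, const uint8_t* src, size_t size) {
      std::memcpy(dst, src, size);
    }
    void MemCopy(ptrdiff_t /*dst*/, const uint8_t* /*src*/, size_t /*size*/) {}

    template <typename T>
    void Emit(T op_base, ptrdiff_t op, const uint8_t* src) {
      // T = char* (SnappyArrayWriter): op_base + op is a pointer and 64 bytes
      // are copied. T = size_t (the validator): op_base + op is an integer,
      // only the ptrdiff_t overload is viable, and the call compiles away.
      MemCopy(op_base + op, src, 64);
    }
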
@@ -1887,8 +1949,8 @@ class SnappyScatteredWriter {
op_limit_(NULL),
op_limit_min_slop_(NULL) {}
char* GetOutputPtr() { return op_ptr_; }
- char* GetBase(char** op_limit_min_slop) {
- *op_limit_min_slop = op_limit_min_slop_;
+ char* GetBase(ptrdiff_t* op_limit_min_slop) {
+ *op_limit_min_slop = op_limit_min_slop_ - op_base_;
return op_base_;
}
void SetOutputPtr(char* op) { op_ptr_ = op; }
diff --git a/snappy_unittest.cc b/snappy_unittest.cc
index 0b568b2..7b220d1 100644
--- a/snappy_unittest.cc
+++ b/snappy_unittest.cc
@@ -1368,7 +1368,7 @@ struct SourceFiles {
size_t max_size = 0;
};
-static void BM_UFlatMedley(int iters, int) {
+static void BM_UFlatMedley(int iters) {
static const SourceFiles* const source = new SourceFiles();
std::vector<char> dst(source->max_size);
@@ -1408,7 +1408,7 @@ static void BM_UValidate(int iters, int arg) {
}
BENCHMARK(BM_UValidate)->DenseRange(0, ARRAYSIZE(files) - 1);
-static void BM_UValidateMedley(int iters, int) {
+static void BM_UValidateMedley(int iters) {
static const SourceFiles* const source = new SourceFiles();
size_t processed = 0;