summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSnappy Team <no-reply@google.com>2020-02-07 14:38:49 +0000
committerVictor Costan <costan@google.com>2020-04-11 04:40:20 +0000
commit9eabb7baba8fb9181cffc75031b4d6c5447425a5 (patch)
tree087e83554d528913299d46fab1440ac652ee6546
parentcddd9c0875e55c9c212802b0fc8082d5b3889edd (diff)
downloadsnappy-git-9eabb7baba8fb9181cffc75031b4d6c5447425a5.tar.gz
Cut a load from the critical dependency chain of the input pointer by speculating the uncommon case of COPY_4 is not happening.
PiperOrigin-RevId: 293803653
-rw-r--r--snappy.cc51
1 files changed, 37 insertions, 14 deletions
diff --git a/snappy.cc b/snappy.cc
index 011996d..73d987e 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -76,11 +76,12 @@
namespace snappy {
+using internal::char_table;
using internal::COPY_1_BYTE_OFFSET;
using internal::COPY_2_BYTE_OFFSET;
-using internal::LITERAL;
-using internal::char_table;
+using internal::COPY_4_BYTE_OFFSET;
using internal::kMaximumTagLength;
+using internal::LITERAL;
// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
@@ -890,18 +891,40 @@ class SnappyDecompressor {
ip += literal_length;
MAYBE_REFILL();
} else {
- const size_t entry = char_table[c];
- const size_t trailer =
- ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
- const size_t length = entry & 0xff;
- ip += entry >> 11;
-
- // copy_offset/256 is encoded in bits 8..10. By just fetching
- // those bits, we get copy_offset (since the bit-field starts at
- // bit 8).
- const size_t copy_offset = entry & 0x700;
- if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
- return;
+ // If we ignore branch prediction misses, then the throughput of the
+ // loop over tags is limited by the data dependency chain carried by
+ // ip. The amount ip advances per iteration depends on the value c.
+ // For COPY_1 it's 2 bytes, for COPY_2 it's 3 bytes and for COPY_4 it's
+ // 5 bytes. We could be branch free and use information from the entry
+ // in char_table to advance ip, but that would lead to the following
+ // critical data dependency chain
+ // ip -> c = *ip++ -> entry = char_table[c] ->
+ // advance = entry >> 11 -> ip += advance
+ // which is 5 + 5 + 1 + 1 = 12 cycles of latency. Meaning the processor
+ // will never be able to execute the loop faster then 12 cycles per
+ // iteration. It's better to speculate it's not a COPY_4, which is an
+ // uncommon tag, in which case we get the data dependency chain
+ // ip -> c = *ip++ -> c &= 3 -> ip += c
+ // which is 5 + 1 + 1 = 7 cycles of latency.
+ if (SNAPPY_PREDICT_FALSE((c & 3) == COPY_4_BYTE_OFFSET)) {
+ const size_t copy_offset = LittleEndian::Load32(ip);
+ const size_t length = (c >> 2) + 1;
+ ip += 4;
+ if (!writer->AppendFromSelf(copy_offset, length)) return;
+ } else {
+ const size_t entry = char_table[c];
+ const size_t trailer =
+ ExtractLowBytes(LittleEndian::Load32(ip), c & 3);
+ const size_t length = entry & 0xff;
+ ip += (c & 3);
+
+ // copy_offset/256 is encoded in bits 8..10. By just fetching
+ // those bits, we get copy_offset (since the bit-field starts at
+ // bit 8).
+ const size_t copy_offset = entry & 0x700;
+ if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
+ return;
+ }
}
MAYBE_REFILL();
}