summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: alkis <alkis@google.com> 2018-08-28 08:47:31 -0700
committer: Victor Costan <pwnall@chromium.org> 2019-01-04 19:07:28 -0800
commit: 1b7466e14300ffa2e7e0e4d21c034f2a12048796 (patch)
tree: 63c1d66247979e50dc2cb36031e25c7a091fd3a4
parent: ea660b57d65d68d521287c903459b6dd3b2804d0 (diff)
download: snappy-git-1b7466e14300ffa2e7e0e4d21c034f2a12048796.tar.gz
Compute the wordmask instead of looking it up in a table.
Tested:

name                     old speed       new speed       delta
BM_UFlat/0 [html ]       2.13GB/s ± 0%   2.46GB/s ± 0%   +15.70%  (p=0.000 n=10+8)
BM_UFlat/1 [urls ]       1.21GB/s ± 0%   1.20GB/s ± 0%    -1.49%  (p=0.000 n=9+10)
BM_UFlat/2 [jpg ]        17.1GB/s ± 1%   17.2GB/s ± 1%      ~     (p=0.120 n=11+11)
BM_UFlat/3 [jpg_200]     1.55GB/s ± 0%   1.54GB/s ± 0%    -0.96%  (p=0.000 n=10+7)
BM_UFlat/4 [pdf ]        12.9GB/s ± 0%   12.6GB/s ± 0%    -1.98%  (p=0.000 n=11+9)
BM_UFlat/5 [html4 ]      1.87GB/s ± 0%   1.87GB/s ± 0%    -0.06%  (p=0.033 n=11+11)
BM_UFlat/6 [txt1 ]        816MB/s ± 0%    793MB/s ± 0%    -2.84%  (p=0.000 n=11+11)
BM_UFlat/7 [txt2 ]        758MB/s ± 0%    737MB/s ± 0%    -2.77%  (p=0.000 n=11+11)
BM_UFlat/8 [txt3 ]        865MB/s ± 0%    839MB/s ± 0%    -2.94%  (p=0.000 n=11+8)
BM_UFlat/9 [txt4 ]        701MB/s ± 0%    679MB/s ± 0%    -3.11%  (p=0.000 n=11+10)
BM_UFlat/10 [pb ]        2.60GB/s ± 2%   3.07GB/s ± 0%   +17.81%  (p=0.000 n=11+11)
BM_UFlat/11 [gaviota]    1.01GB/s ± 0%   0.97GB/s ± 0%    -3.83%  (p=0.000 n=11+10)
BM_UFlat/12 [cp ]        1.66GB/s ± 1%   1.73GB/s ± 1%    +4.32%  (p=0.000 n=11+11)
BM_UFlat/13 [c ]         1.52GB/s ± 1%   1.53GB/s ± 0%    +0.49%  (p=0.002 n=11+11)
BM_UFlat/14 [lsp ]       1.61GB/s ± 0%   1.64GB/s ± 0%    +2.10%  (p=0.000 n=10+11)
BM_UFlat/15 [xls ]       1.12GB/s ± 0%   1.08GB/s ± 0%    -3.95%  (p=0.000 n=11+7)
BM_UFlat/16 [xls_200]     926MB/s ± 1%    935MB/s ± 1%      ~     (p=0.056 n=9+11)
BM_UFlat/17 [bin ]       1.89GB/s ± 0%   1.86GB/s ± 0%    -1.32%  (p=0.000 n=11+11)
BM_UFlat/18 [bin_200]    1.96GB/s ± 0%   1.99GB/s ± 1%    +1.78%  (p=0.000 n=11+11)
BM_UFlat/19 [sum ]       1.32GB/s ± 0%   1.31GB/s ± 0%    -0.79%  (p=0.000 n=11+10)
BM_UFlat/20 [man ]       1.40GB/s ± 0%   1.43GB/s ± 0%    +2.51%  (p=0.000 n=9+10)
BM_UValidate/0 [html ]   2.95GB/s ± 1%   3.07GB/s ± 0%    +4.11%  (p=0.000 n=10+11)
BM_UValidate/1 [urls ]   1.57GB/s ± 0%   1.60GB/s ± 0%    +2.24%  (p=0.000 n=10+11)
BM_UValidate/2 [jpg ]     822GB/s ± 0%    850GB/s ± 0%    +3.42%  (p=0.000 n=10+11)
BM_UValidate/3 [jpg_200] 2.01GB/s ± 0%   2.04GB/s ± 0%    +1.24%  (p=0.000 n=11+11)
BM_UValidate/4 [pdf ]    33.7GB/s ± 0%   35.9GB/s ± 1%    +6.51%  (p=0.000 n=10+11)
BM_UIOVec/0 [html ]       852MB/s ± 0%    852MB/s ± 0%      ~     (p=0.898 n=11+11)
BM_UIOVec/1 [urls ]       663MB/s ± 0%    652MB/s ± 0%    -1.61%  (p=0.000 n=11+11)
BM_UIOVec/2 [jpg ]       15.3GB/s ± 1%   15.3GB/s ± 2%      ~     (p=0.459 n=9+10)
BM_UIOVec/3 [jpg_200]     652MB/s ± 0%    627MB/s ± 1%    -3.80%  (p=0.000 n=10+11)
BM_UIOVec/4 [pdf ]       8.80GB/s ± 1%   8.57GB/s ± 1%    -2.62%  (p=0.000 n=10+11)
BM_UFlatSink/0 [html ]   2.13GB/s ± 0%   2.46GB/s ± 0%   +15.63%  (p=0.000 n=11+11)
BM_UFlatSink/1 [urls ]   1.21GB/s ± 0%   1.20GB/s ± 0%    -1.42%  (p=0.000 n=11+10)
BM_UFlatSink/2 [jpg ]    17.1GB/s ± 2%   17.2GB/s ± 1%      ~     (p=0.175 n=11+9)
BM_UFlatSink/3 [jpg_200] 1.52GB/s ± 1%   1.47GB/s ± 3%    -3.15%  (p=0.000 n=11+11)
BM_UFlatSink/4 [pdf ]    12.8GB/s ± 1%   12.6GB/s ± 1%    -1.76%  (p=0.000 n=11+11)
BM_UFlatSink/5 [html4 ]  1.87GB/s ± 0%   1.87GB/s ± 0%    -0.19%  (p=0.000 n=11+10)
BM_UFlatSink/6 [txt1 ]    816MB/s ± 0%    792MB/s ± 0%    -2.94%  (p=0.000 n=11+11)
BM_UFlatSink/7 [txt2 ]    758MB/s ± 0%    736MB/s ± 0%    -2.83%  (p=0.000 n=11+11)
BM_UFlatSink/8 [txt3 ]    865MB/s ± 0%    838MB/s ± 0%    -3.13%  (p=0.000 n=11+11)
BM_UFlatSink/9 [txt4 ]    701MB/s ± 0%    678MB/s ± 0%    -3.20%  (p=0.000 n=11+11)
BM_UFlatSink/10 [pb ]    2.60GB/s ± 2%   3.07GB/s ± 0%   +18.27%  (p=0.000 n=11+10)
BM_UFlatSink/11 [gaviota] 1.01GB/s ± 0%  0.97GB/s ± 0%    -3.90%  (p=0.000 n=11+11)
BM_UFlatSink/12 [cp ]    1.66GB/s ± 1%   1.73GB/s ± 1%    +4.62%  (p=0.000 n=11+10)
BM_UFlatSink/13 [c ]     1.52GB/s ± 0%   1.53GB/s ± 1%      ~     (p=0.180 n=9+11)
BM_UFlatSink/14 [lsp ]   1.61GB/s ± 0%   1.64GB/s ± 1%    +1.98%  (p=0.000 n=9+11)
BM_UFlatSink/15 [xls ]   1.12GB/s ± 0%   1.08GB/s ± 0%    -3.76%  (p=0.000 n=11+11)
BM_UFlatSink/16 [xls_200] 909MB/s ± 2%    924MB/s ± 1%    +1.62%  (p=0.000 n=11+11)
BM_UFlatSink/17 [bin ]   1.88GB/s ± 0%   1.86GB/s ± 0%    -1.18%  (p=0.000 n=9+11)
BM_UFlatSink/18 [bin_200] 1.94GB/s ± 2%  1.94GB/s ± 1%      ~     (p=0.090 n=11+11)
BM_UFlatSink/19 [sum ]   1.32GB/s ± 0%   1.31GB/s ± 0%    -0.76%  (p=0.000 n=11+11)
BM_UFlatSink/20 [man ]   1.39GB/s ± 2%   1.43GB/s ± 0%    +2.75%  (p=0.000 n=11+10)

Assembly before:

* 44 8b 5c 85 a0        mov    -0x60(%rbp,%rax,4),%r11d
  45 23 5d 00           and    0x0(%r13),%r11d
  89 d6                 mov    %edx,%esi
  81 e6 00 07 00 00     and    $0x700,%esi

Assembly after:

* 89 c1                 mov    %eax,%ecx
* c0 e1 03              shl    $0x3,%cl
* bf ff ff ff ff        mov    $0xffffffff,%edi
* 48 d3 e7              shl    %cl,%rdi
* f7 d7                 not    %edi
  41 23 7d 00           and    0x0(%r13),%edi
  41 89 d3              mov    %edx,%r11d
  41 81 e3 00 07 00 00  and    $0x700,%r11d
-rw-r--r-- snappy.cc 35
1 files changed, 11 insertions, 24 deletions
diff --git a/snappy.cc b/snappy.cc
index e594bb9..5352c24 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -675,14 +675,15 @@ static inline void Report(const char *algorithm, size_t compressed_size,
// bool TryFastAppend(const char* ip, size_t available, size_t length);
// };
-namespace internal {
-
-// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
-static const uint32 wordmask[] = {
- 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
-};
-
-} // end namespace internal
+// Mapping from n in range [0,4] to a mask to extract the bottom 8*n bits.
+static inline uint32 WordMask(int n) {
+ DCHECK_GE(n, 0);
+ DCHECK_LE(n, 4);
+ // This needs to be wider than uint32 otherwise `mask << 32` will be
+ // undefined.
+ uint64 mask = 0xffffffff;
+ return ~(mask << (8 * n));
+}
// Helper class for decompression
class SnappyDecompressor {
@@ -770,20 +771,6 @@ class SnappyDecompressor {
#endif
const char* ip = ip_;
- // For position-independent executables, accessing global arrays can be
- // slow. Move wordmask array onto the stack to mitigate this.
- uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
- // Do not use memcpy to copy internal::wordmask to
- // wordmask. LLVM converts stack arrays to global arrays if it detects
- // const stack arrays and this hurts the performance of position
- // independent code. This change is temporary and can be reverted when
- // https://reviews.llvm.org/D30759 is approved.
- wordmask[0] = internal::wordmask[0];
- wordmask[1] = internal::wordmask[1];
- wordmask[2] = internal::wordmask[2];
- wordmask[3] = internal::wordmask[3];
- wordmask[4] = internal::wordmask[4];
-
// We could have put this refill fragment only at the beginning of the loop.
// However, duplicating it at the end of each branch gives the compiler more
// scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -825,7 +812,7 @@ class SnappyDecompressor {
// Long literal.
const size_t literal_length_length = literal_length - 60;
literal_length =
- (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+ (LittleEndian::Load32(ip) & WordMask(literal_length_length)) + 1;
ip += literal_length_length;
}
@@ -848,7 +835,7 @@ class SnappyDecompressor {
MAYBE_REFILL();
} else {
const size_t entry = char_table[c];
- const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ const size_t trailer = LittleEndian::Load32(ip) & WordMask(entry >> 11);
const size_t length = entry & 0xff;
ip += entry >> 11;