summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorcostan <costan@google.com>2019-01-08 11:31:10 -0800
committerVictor Costan <pwnall@chromium.org>2019-01-08 13:48:30 -0800
commit97a20b480fb35152d6f6f5c5d6bd66a218ba48e4 (patch)
tree0f3114caa9fb3f73cb7e5f1f9734911e2e66b5c0
parent4f0adca400ed519c463f8bbc3031296e38dd746d (diff)
downloadsnappy-git-97a20b480fb35152d6f6f5c5d6bd66a218ba48e4.tar.gz
Reduce the LeftShiftOverflows() table size.
A previous CL introduced LeftShiftOverflows(), which takes a uint32 input. However, the value it operates on is guaranteed to only have 8 bits set. This CL takes advantage of this restriction to reduce the size of the static table used to compute LeftShiftOverflows(). The same methodology as the previous CL suggests a 0.6% improvement. The improvement is likely bigger on mobile CPUs that have much smaller caches. Benchmark results: name old time/op new time/op delta BM_UFlat/0 [html ] 42.5µs ± 1% 42.1µs ± 0% -0.87% (p=0.000 n=20+20) BM_UFlat/1 [urls ] 575µs ± 0% 574µs ± 0% -0.16% (p=0.000 n=20+19) BM_UFlat/2 [jpg ] 7.13µs ± 1% 7.20µs ± 5% ~ (p=0.422 n=16+19) BM_UFlat/3 [jpg_200 ] 129ns ± 0% 130ns ± 0% +0.82% (p=0.000 n=20+17) BM_UFlat/4 [pdf ] 8.22µs ± 1% 8.21µs ± 0% ~ (p=0.586 n=17+17) BM_UFlat/5 [html4 ] 222µs ± 0% 222µs ± 0% -0.11% (p=0.047 n=19+20) BM_UFlat/6 [txt1 ] 192µs ± 0% 191µs ± 0% -0.69% (p=0.000 n=20+20) BM_UFlat/7 [txt2 ] 169µs ± 0% 169µs ± 0% -0.28% (p=0.000 n=20+20) BM_UFlat/8 [txt3 ] 510µs ± 0% 507µs ± 0% -0.50% (p=0.000 n=20+20) BM_UFlat/9 [txt4 ] 707µs ± 0% 703µs ± 0% -0.53% (p=0.000 n=20+20) BM_UFlat/10 [pb ] 39.1µs ± 0% 38.5µs ± 0% -1.56% (p=0.000 n=20+20) BM_UFlat/11 [gaviota ] 189µs ± 0% 189µs ± 0% -0.42% (p=0.000 n=20+20) BM_UFlat/12 [cp ] 14.2µs ± 0% 14.2µs ± 1% -0.30% (p=0.001 n=18+19) BM_UFlat/13 [c ] 7.29µs ± 0% 7.34µs ± 1% +0.59% (p=0.000 n=19+20) BM_UFlat/14 [lsp ] 2.28µs ± 0% 2.29µs ± 1% +0.39% (p=0.000 n=19+18) BM_UFlat/15 [xls ] 905µs ± 0% 904µs ± 0% -0.12% (p=0.030 n=20+20) BM_UFlat/16 [xls_200 ] 213ns ± 2% 215ns ± 4% +0.92% (p=0.011 n=20+20) BM_UFlat/17 [bin ] 274µs ± 0% 275µs ± 0% +0.55% (p=0.000 n=20+20) BM_UFlat/18 [bin_200 ] 101ns ± 1% 101ns ± 1% ~ (p=0.913 n=18+18) BM_UFlat/19 [sum ] 27.9µs ± 1% 27.5µs ± 1% -1.38% (p=0.000 n=20+20) BM_UFlat/20 [man ] 2.97µs ± 1% 2.97µs ± 1% ~ (p=0.835 n=20+19) BM_UValidate/0 [html ] 33.5µs ± 0% 34.2µs ± 0% +2.32% (p=0.000 n=20+20) BM_UValidate/1 [urls ] 441µs ± 0% 442µs ± 0% +0.15% (p=0.010 n=20+20) BM_UValidate/2 [jpg ] 144ns ± 0% 146ns ± 0% +1.32% (p=0.000 n=20+20) BM_UValidate/3 [jpg_200 ] 95.3ns ± 0% 96.0ns ± 0% +0.68% (p=0.000 n=20+20) BM_UValidate/4 [pdf ] 2.86µs ± 0% 2.88µs ± 1% +0.67% (p=0.000 n=19+19) BM_UIOVec/0 [html ] 122µs ± 0% 122µs ± 0% -0.25% (p=0.000 n=20+20) BM_UIOVec/1 [urls ] 1.08ms ± 0% 1.08ms ± 0% ~ (p=0.068 n=20+20) BM_UIOVec/2 [jpg ] 7.63µs ± 7% 7.76µs ±11% ~ (p=0.396 n=19+20) BM_UIOVec/3 [jpg_200 ] 325ns ± 0% 326ns ± 0% +0.27% (p=0.000 n=20+18) BM_UIOVec/4 [pdf ] 12.1µs ± 2% 12.1µs ± 3% ~ (p=0.967 n=19+20) BM_UFlatSink/0 [html ] 42.4µs ± 0% 42.1µs ± 0% -0.89% (p=0.000 n=20+20) BM_UFlatSink/1 [urls ] 575µs ± 0% 575µs ± 0% ~ (p=0.883 n=20+20) BM_UFlatSink/2 [jpg ] 7.58µs ±16% 7.52µs ±15% ~ (p=0.945 n=19+20) BM_UFlatSink/3 [jpg_200 ] 133ns ± 4% 133ns ± 4% ~ (p=0.627 n=19+20) BM_UFlatSink/4 [pdf ] 8.29µs ± 4% 8.39µs ± 4% +1.14% (p=0.013 n=19+18) BM_UFlatSink/5 [html4 ] 223µs ± 0% 222µs ± 0% -0.18% (p=0.001 n=20+20) BM_UFlatSink/6 [txt1 ] 192µs ± 0% 191µs ± 0% -0.71% (p=0.000 n=20+20) BM_UFlatSink/7 [txt2 ] 169µs ± 0% 169µs ± 0% -0.26% (p=0.000 n=20+20) BM_UFlatSink/8 [txt3 ] 510µs ± 0% 508µs ± 0% -0.50% (p=0.000 n=20+20) BM_UFlatSink/9 [txt4 ] 707µs ± 0% 704µs ± 0% -0.44% (p=0.000 n=20+20) BM_UFlatSink/10 [pb ] 39.1µs ± 0% 38.5µs ± 1% -1.62% (p=0.000 n=19+20) BM_UFlatSink/11 [gaviota ] 189µs ± 0% 189µs ± 0% -0.39% (p=0.000 n=20+20) BM_UFlatSink/12 [cp ] 14.2µs ± 0% 14.2µs ± 1% ~ (p=0.435 n=19+19) BM_UFlatSink/13 [c ] 7.29µs ± 0% 7.33µs ± 1% +0.57% (p=0.000 n=19+20) BM_UFlatSink/14 [lsp ] 2.29µs ± 0% 2.29µs ± 1% ~ (p=0.791 n=18+18) BM_UFlatSink/15 [xls ] 903µs ± 0% 902µs ± 0% -0.11% (p=0.044 n=20+19) BM_UFlatSink/16 [xls_200 ] 215ns ± 1% 215ns ± 1% ~ (p=0.885 n=19+19) BM_UFlatSink/17 [bin ] 274µs ± 0% 275µs ± 0% +0.51% (p=0.000 n=20+20) BM_UFlatSink/18 [bin_200 ] 103ns ± 2% 103ns ± 0% -0.41% (p=0.016 n=20+15) BM_UFlatSink/19 [sum ] 27.9µs ± 1% 27.5µs ± 1% -1.34% (p=0.000 n=20+19) BM_UFlatSink/20 [man ] 2.98µs ± 1% 2.97µs ± 1% ~ (p=0.358 n=18+19) BM_ZFlat/0 [html (22.31 %) ] 126µs ± 0% 126µs ± 0% +0.14% (p=0.011 n=20+20) BM_ZFlat/1 [urls (47.78 %) ] 1.67ms ± 0% 1.67ms ± 0% +0.11% (p=0.043 n=20+20) BM_ZFlat/2 [jpg (99.95 %) ] 11.5µs ± 6% 11.7µs ± 7% ~ (p=0.142 n=20+20) BM_ZFlat/3 [jpg_200 (73.00 %)] 349ns ± 3% 351ns ± 3% ~ (p=0.573 n=18+20) BM_ZFlat/4 [pdf (83.30 %) ] 14.6µs ± 2% 14.7µs ± 4% ~ (p=0.879 n=19+20) BM_ZFlat/5 [html4 (22.52 %) ] 553µs ± 0% 552µs ± 0% -0.23% (p=0.000 n=20+20) BM_ZFlat/6 [txt1 (57.88 %) ] 540µs ± 0% 540µs ± 0% ~ (p=0.221 n=20+20) BM_ZFlat/7 [txt2 (61.91 %) ] 479µs ± 0% 481µs ± 1% +0.47% (p=0.000 n=20+20) BM_ZFlat/8 [txt3 (54.99 %) ] 1.44ms ± 0% 1.44ms ± 0% +0.13% (p=0.040 n=20+20) BM_ZFlat/9 [txt4 (66.26 %) ] 1.97ms ± 0% 1.97ms ± 0% +0.16% (p=0.009 n=20+20) BM_ZFlat/10 [pb (19.68 %) ] 110µs ± 1% 109µs ± 1% -0.79% (p=0.000 n=20+20) BM_ZFlat/11 [gaviota (37.72 %)] 410µs ± 0% 410µs ± 0% ~ (p=0.149 n=20+19) BM_ZFlat/12 [cp (48.12 %) ] 45.4µs ± 1% 44.9µs ± 1% -1.23% (p=0.000 n=20+20) BM_ZFlat/13 [c (42.47 %) ] 17.5µs ± 0% 17.5µs ± 1% ~ (p=0.883 n=20+20) BM_ZFlat/14 [lsp (48.37 %) ] 5.51µs ± 1% 5.46µs ± 1% -0.95% (p=0.000 n=20+18) BM_ZFlat/15 [xls (41.23 %) ] 1.61ms ± 0% 1.62ms ± 0% ~ (p=0.183 n=20+20) BM_ZFlat/16 [xls_200 (78.00 %)] 389ns ± 2% 391ns ± 3% ~ (p=0.740 n=18+20) BM_ZFlat/17 [bin (18.11 %) ] 508µs ± 0% 508µs ± 0% ~ (p=0.779 n=20+20) BM_ZFlat/18 [bin_200 (7.50 %) ] 87.4ns ± 5% 88.1ns ± 8% ~ (p=0.367 n=16+19) BM_ZFlat/19 [sum (48.96 %) ] 79.1µs ± 0% 80.2µs ± 0% +1.39% (p=0.000 n=20+20) BM_ZFlat/20 [man (59.21 %) ] 7.55µs ± 1% 7.57µs ± 1% +0.31% (p=0.025 n=19+19) name old speed new speed delta BM_UFlat/0 [html ] 2.42GB/s ± 0% 2.44GB/s ± 0% +0.77% (p=0.000 n=19+19) BM_UFlat/1 [urls ] 1.22GB/s ± 0% 1.23GB/s ± 0% +0.06% (p=0.000 n=20+19) BM_UFlat/2 [jpg ] 17.3GB/s ± 2% 17.2GB/s ± 4% ~ (p=0.433 n=17+19) BM_UFlat/3 [jpg_200 ] 1.56GB/s ± 0% 1.54GB/s ± 0% -0.82% (p=0.000 n=20+20) BM_UFlat/4 [pdf ] 12.5GB/s ± 1% 12.5GB/s ± 1% ~ (p=0.322 n=17+17) BM_UFlat/5 [html4 ] 1.85GB/s ± 0% 1.85GB/s ± 0% +0.16% (p=0.000 n=20+20) BM_UFlat/6 [txt1 ] 794MB/s ± 0% 800MB/s ± 0% +0.68% (p=0.000 n=18+20) BM_UFlat/7 [txt2 ] 741MB/s ± 0% 743MB/s ± 0% +0.30% (p=0.000 n=19+19) BM_UFlat/8 [txt3 ] 840MB/s ± 0% 844MB/s ± 0% +0.53% (p=0.000 n=18+20) BM_UFlat/9 [txt4 ] 684MB/s ± 0% 688MB/s ± 0% +0.57% (p=0.000 n=20+17) BM_UFlat/10 [pb ] 3.04GB/s ± 0% 3.09GB/s ± 0% +1.60% (p=0.000 n=19+20) BM_UFlat/11 [gaviota ] 977MB/s ± 0% 981MB/s ± 0% +0.45% (p=0.000 n=19+19) BM_UFlat/12 [cp ] 1.74GB/s ± 0% 1.74GB/s ± 0% +0.29% (p=0.000 n=20+19) BM_UFlat/13 [c ] 1.53GB/s ± 0% 1.52GB/s ± 1% -0.56% (p=0.000 n=19+20) BM_UFlat/14 [lsp ] 1.64GB/s ± 0% 1.63GB/s ± 1% -0.38% (p=0.000 n=19+20) BM_UFlat/15 [xls ] 1.14GB/s ± 0% 1.14GB/s ± 0% +0.11% (p=0.000 n=19+20) BM_UFlat/16 [xls_200 ] 941MB/s ± 1% 931MB/s ± 4% -1.02% (p=0.001 n=19+20) BM_UFlat/17 [bin ] 1.88GB/s ± 0% 1.87GB/s ± 0% -0.51% (p=0.000 n=20+20) BM_UFlat/18 [bin_200 ] 1.98GB/s ± 0% 1.98GB/s ± 1% ~ (p=0.767 n=18+18) BM_UFlat/19 [sum ] 1.37GB/s ± 0% 1.39GB/s ± 0% +1.46% (p=0.000 n=20+20) BM_UFlat/20 [man ] 1.43GB/s ± 0% 1.43GB/s ± 0% ~ (p=0.501 n=18+18) BM_UValidate/0 [html ] 3.07GB/s ± 0% 3.00GB/s ± 0% -2.25% (p=0.000 n=20+20) BM_UValidate/1 [urls ] 1.60GB/s ± 0% 1.59GB/s ± 0% -0.11% (p=0.000 n=18+19) BM_UValidate/2 [jpg ] 859GB/s ± 0% 848GB/s ± 0% -1.29% (p=0.000 n=20+19) BM_UValidate/3 [jpg_200 ] 2.10GB/s ± 0% 2.09GB/s ± 0% -0.68% (p=0.000 n=19+20) BM_UValidate/4 [pdf ] 35.9GB/s ± 0% 35.6GB/s ± 1% -0.71% (p=0.000 n=20+20) BM_UIOVec/0 [html ] 843MB/s ± 0% 844MB/s ± 0% +0.21% (p=0.000 n=20+20) BM_UIOVec/1 [urls ] 651MB/s ± 0% 650MB/s ± 0% -0.10% (p=0.000 n=20+20) BM_UIOVec/2 [jpg ] 16.2GB/s ± 6% 16.0GB/s ±10% ~ (p=0.380 n=19+20) BM_UIOVec/3 [jpg_200 ] 617MB/s ± 0% 615MB/s ± 0% -0.24% (p=0.000 n=20+17) BM_UIOVec/4 [pdf ] 8.52GB/s ± 3% 8.50GB/s ± 3% ~ (p=0.771 n=19+20) BM_UFlatSink/0 [html ] 2.42GB/s ± 0% 2.44GB/s ± 0% +0.93% (p=0.000 n=20+20) BM_UFlatSink/1 [urls ] 1.23GB/s ± 0% 1.23GB/s ± 0% +0.04% (p=0.006 n=20+20) BM_UFlatSink/2 [jpg ] 16.4GB/s ±14% 16.5GB/s ±13% ~ (p=0.879 n=19+20) BM_UFlatSink/3 [jpg_200 ] 1.51GB/s ± 4% 1.51GB/s ± 4% ~ (p=0.874 n=18+20) BM_UFlatSink/4 [pdf ] 12.4GB/s ± 4% 12.3GB/s ± 4% -1.11% (p=0.016 n=19+18) BM_UFlatSink/5 [html4 ] 1.85GB/s ± 0% 1.85GB/s ± 0% +0.20% (p=0.000 n=20+20) BM_UFlatSink/6 [txt1 ] 794MB/s ± 0% 799MB/s ± 0% +0.72% (p=0.000 n=19+20) BM_UFlatSink/7 [txt2 ] 741MB/s ± 0% 743MB/s ± 0% +0.30% (p=0.000 n=18+20) BM_UFlatSink/8 [txt3 ] 839MB/s ± 0% 843MB/s ± 0% +0.52% (p=0.000 n=20+18) BM_UFlatSink/9 [txt4 ] 684MB/s ± 0% 687MB/s ± 0% +0.46% (p=0.000 n=20+20) BM_UFlatSink/10 [pb ] 3.04GB/s ± 0% 3.09GB/s ± 0% +1.71% (p=0.000 n=20+19) BM_UFlatSink/11 [gaviota ] 976MB/s ± 0% 980MB/s ± 0% +0.45% (p=0.000 n=20+20) BM_UFlatSink/12 [cp ] 1.74GB/s ± 1% 1.74GB/s ± 1% ~ (p=0.904 n=20+20) BM_UFlatSink/13 [c ] 1.53GB/s ± 0% 1.53GB/s ± 1% -0.50% (p=0.000 n=19+20) BM_UFlatSink/14 [lsp ] 1.63GB/s ± 1% 1.63GB/s ± 1% ~ (p=0.358 n=19+18) BM_UFlatSink/15 [xls ] 1.14GB/s ± 0% 1.15GB/s ± 0% +0.12% (p=0.000 n=20+20) BM_UFlatSink/16 [xls_200 ] 931MB/s ± 1% 931MB/s ± 1% ~ (p=0.686 n=19+19) BM_UFlatSink/17 [bin ] 1.88GB/s ± 0% 1.87GB/s ± 0% -0.53% (p=0.000 n=20+20) BM_UFlatSink/18 [bin_200 ] 1.94GB/s ± 2% 1.95GB/s ± 1% +0.42% (p=0.014 n=20+15) BM_UFlatSink/19 [sum ] 1.37GB/s ± 0% 1.39GB/s ± 0% +1.38% (p=0.000 n=19+18) BM_UFlatSink/20 [man ] 1.42GB/s ± 1% 1.43GB/s ± 0% ~ (p=0.284 n=18+19) BM_ZFlat/0 [html (22.31 %) ] 815MB/s ± 0% 814MB/s ± 0% -0.15% (p=0.000 n=20+20) BM_ZFlat/1 [urls (47.78 %) ] 423MB/s ± 0% 422MB/s ± 0% -0.14% (p=0.000 n=20+20) BM_ZFlat/2 [jpg (99.95 %) ] 10.8GB/s ± 5% 10.6GB/s ± 7% ~ (p=0.142 n=20+20) BM_ZFlat/3 [jpg_200 (73.00 %)] 574MB/s ± 2% 572MB/s ± 2% ~ (p=0.613 n=18+20) BM_ZFlat/4 [pdf (83.30 %) ] 7.01GB/s ± 2% 7.01GB/s ± 4% ~ (p=0.593 n=18+20) BM_ZFlat/5 [html4 (22.52 %) ] 743MB/s ± 0% 745MB/s ± 0% +0.25% (p=0.000 n=20+19) BM_ZFlat/6 [txt1 (57.88 %) ] 283MB/s ± 0% 282MB/s ± 0% ~ (p=0.261 n=18+19) BM_ZFlat/7 [txt2 (61.91 %) ] 262MB/s ± 0% 261MB/s ± 0% -0.35% (p=0.000 n=20+19) BM_ZFlat/8 [txt3 (54.99 %) ] 298MB/s ± 0% 297MB/s ± 0% -0.11% (p=0.000 n=20+19) BM_ZFlat/9 [txt4 (66.26 %) ] 245MB/s ± 0% 245MB/s ± 0% -0.13% (p=0.000 n=19+20) BM_ZFlat/10 [pb (19.68 %) ] 1.08GB/s ± 0% 1.09GB/s ± 0% +0.82% (p=0.000 n=18+19) BM_ZFlat/11 [gaviota (37.72 %)] 451MB/s ± 0% 451MB/s ± 0% -0.05% (p=0.004 n=19+20) BM_ZFlat/12 [cp (48.12 %) ] 543MB/s ± 1% 550MB/s ± 1% +1.24% (p=0.000 n=20+20) BM_ZFlat/13 [c (42.47 %) ] 638MB/s ± 0% 637MB/s ± 0% ~ (p=0.708 n=19+19) BM_ZFlat/14 [lsp (48.37 %) ] 678MB/s ± 2% 684MB/s ± 1% +0.89% (p=0.000 n=20+19) BM_ZFlat/15 [xls (41.23 %) ] 640MB/s ± 0% 640MB/s ± 0% -0.10% (p=0.000 n=19+19) BM_ZFlat/16 [xls_200 (78.00 %)] 515MB/s ± 2% 514MB/s ± 3% ~ (p=0.916 n=18+19) BM_ZFlat/17 [bin (18.11 %) ] 1.01GB/s ± 0% 1.01GB/s ± 0% +0.03% (p=0.033 n=20+20) BM_ZFlat/18 [bin_200 (7.50 %) ] 2.30GB/s ± 6% 2.28GB/s ± 9% ~ (p=0.502 n=16+19) BM_ZFlat/19 [sum (48.96 %) ] 485MB/s ± 0% 478MB/s ± 0% -1.39% (p=0.000 n=19+20) BM_ZFlat/20 [man (59.21 %) ] 562MB/s ± 1% 560MB/s ± 1% -0.37% (p=0.016 n=18+19)
-rw-r--r--snappy.cc18
1 files changed, 7 insertions, 11 deletions
diff --git a/snappy.cc b/snappy.cc
index 4ab26d7..6dcb4bb 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -732,17 +732,13 @@ static inline uint32 ExtractLowBytes(uint32 v, int n) {
#endif
}
-static inline bool LeftShiftOverflows(uint32 value, uint32 shift) {
+static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
DCHECK_LT(shift, 32);
- static const uint32 masks[] = {
- 0x00000000, 0x80000000, 0xc0000000, 0xe0000000, //
- 0xf0000000, 0xf8000000, 0xfc000000, 0xfe000000, //
- 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, //
- 0xfff00000, 0xfff80000, 0xfffc0000, 0xfffe0000, //
- 0xffff0000, 0xffff8000, 0xffffc000, 0xffffe000, //
- 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, //
- 0xffffff00, 0xffffff80, 0xffffffc0, 0xffffffe0, //
- 0xfffffff0, 0xfffffff8, 0xfffffffc, 0xfffffffe};
+ static const uint8 masks[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
return (value & masks[shift]) != 0;
}
@@ -798,7 +794,7 @@ class SnappyDecompressor {
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
reader_->Skip(1);
uint32 val = c & 0x7f;
- if (LeftShiftOverflows(val, shift)) return false;
+ if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
*result |= val << shift;
if (c < 128) {
break;