From 07aad56d4785780c2bbf5fedd69675dd16e13517 Mon Sep 17 00:00:00 2001
From: "snappy.mirrorbot@gmail.com"
Date: Thu, 23 Feb 2012 17:00:36 +0000
Subject: For 32-bit platforms, do not try to accelerate multiple neighboring
 32-bit loads with a 64-bit load during compression (it's not a win).

The main target for this optimization is ARM, but 32-bit x86 gets
a small gain, too, although there is noise in the microbenchmarks.
It's a no-op for 64-bit x86. It does not affect decompression.

Microbenchmark results on a Cortex-A9 1GHz, using g++ 4.6.2 (from
Ubuntu/Linaro), -O2 -DNDEBUG -Wa,-march=armv7a -mtune=cortex-a9
-mthumb-interwork, minimum 1000 iterations:

Benchmark            Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_ZFlat/0            1158277    1160000       1000 84.2MB/s  html (23.57 %)    [ +4.3%]
BM_ZFlat/1           14861782   14860000       1000 45.1MB/s  urls (50.89 %)    [ +1.1%]
BM_ZFlat/2             393595     390000       1000 310.5MB/s jpg (99.88 %)     [ +0.0%]
BM_ZFlat/3             650583     650000       1000 138.4MB/s pdf (82.13 %)     [ +3.1%]
BM_ZFlat/4            4661480    4660000       1000 83.8MB/s  html4 (23.55 %)   [ +4.3%]
BM_ZFlat/5             491973     490000       1000 47.9MB/s  cp (48.12 %)      [ +2.0%]
BM_ZFlat/6             193575     192678       1038 55.2MB/s  c (42.40 %)       [ +9.0%]
BM_ZFlat/7              62343      62754       3187 56.5MB/s  lsp (48.37 %)     [ +2.6%]
BM_ZFlat/8           17708468   17710000       1000 55.5MB/s  xls (41.34 %)     [ -0.3%]
BM_ZFlat/9            3755345    3760000       1000 38.6MB/s  txt1 (59.81 %)    [ +8.2%]
BM_ZFlat/10           3324217    3320000       1000 36.0MB/s  txt2 (64.07 %)    [ +4.2%]
BM_ZFlat/11          10139932   10140000       1000 40.1MB/s  txt3 (57.11 %)    [ +6.4%]
BM_ZFlat/12          13532109   13530000       1000 34.0MB/s  txt4 (68.35 %)    [ +5.0%]
BM_ZFlat/13           4690847    4690000       1000 104.4MB/s bin (18.21 %)     [ +4.1%]
BM_ZFlat/14            830682     830000       1000 43.9MB/s  sum (51.88 %)     [ +1.2%]
BM_ZFlat/15             84784      85011       2235 47.4MB/s  man (59.36 %)     [ +1.1%]
BM_ZFlat/16           1293254    1290000       1000 87.7MB/s  pb (23.15 %)      [ +2.3%]
BM_ZFlat/17           2775155    2780000       1000 63.2MB/s  gaviota (38.27 %) [+12.2%]

Core i7 in 32-bit mode (only one run and 100 iterations, though, so noisy):

Benchmark            Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_ZFlat/0             227582     223464       3043 437.0MB/s html (23.57 %)    [ +7.4%]
BM_ZFlat/1            2982430    2918455        233 229.4MB/s urls (50.89 %)    [ +2.9%]
BM_ZFlat/2              46967      46658      15217 2.5GB/s   jpg (99.88 %)     [ +0.0%]
BM_ZFlat/3             115298     114864       5833 783.2MB/s pdf (82.13 %)     [ +1.5%]
BM_ZFlat/4             913440     899743        778 434.2MB/s html4 (23.55 %)   [ +0.3%]
BM_ZFlat/5             110302     108571       7000 216.1MB/s cp (48.12 %)      [ +0.0%]
BM_ZFlat/6              44409      43372      15909 245.2MB/s c (42.40 %)       [ +0.8%]
BM_ZFlat/7              15713      15643      46667 226.9MB/s lsp (48.37 %)     [ +2.7%]
BM_ZFlat/8            2625539    2602230        269 377.4MB/s xls (41.34 %)     [ +1.4%]
BM_ZFlat/9             808884     811429        875 178.8MB/s txt1 (59.81 %)    [ -3.9%]
BM_ZFlat/10            709532     700000       1000 170.5MB/s txt2 (64.07 %)    [ +0.0%]
BM_ZFlat/11           2177682    2162162        333 188.2MB/s txt3 (57.11 %)    [ -1.4%]
BM_ZFlat/12           2849640    2840000        250 161.8MB/s txt4 (68.35 %)    [ -1.4%]
BM_ZFlat/13            849760     835476        778 585.8MB/s bin (18.21 %)     [ +1.2%]
BM_ZFlat/14            165940     164571       4375 221.6MB/s sum (51.88 %)     [ +1.4%]
BM_ZFlat/15             20939      20571      35000 196.0MB/s man (59.36 %)     [ +2.1%]
BM_ZFlat/16            239209     236544       2917 478.1MB/s pb (23.15 %)      [ +4.2%]
BM_ZFlat/17            616206     610000       1000 288.2MB/s gaviota (38.27 %) [ -1.6%]

R=sanjay

git-svn-id: http://snappy.googlecode.com/svn/trunk@60 03e5f5b5-db94-4691-08a0-1a8bf15f6143
---
 snappy.cc | 41 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/snappy.cc b/snappy.cc
index 6c6bd6b..4d4eb42 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -272,16 +272,49 @@ uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
 }
 }  // end namespace internal
 
-// For 0 <= offset <= 4, GetUint32AtOffset(UNALIGNED_LOAD64(p), offset) will
+// For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
 // equal UNALIGNED_LOAD32(p + offset).  Motivation: On x86-64 hardware we have
 // empirically found that overlapping loads such as
 //  UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
 // are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
+//
+// We have different versions for 64- and 32-bit; ideally we would avoid the
+// two functions and just inline the UNALIGNED_LOAD64 call into
+// GetUint32AtOffset, but GCC (at least as of 4.6) is seemingly not clever
+// enough to avoid loading the value multiple times in that case.  For 64-bit,
+// the load is done when GetEightBytesAt() is called, whereas for 32-bit, the
+// load is done at GetUint32AtOffset() time.
+
+#ifdef ARCH_K8
+
+typedef uint64 EightBytesReference;
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+  return UNALIGNED_LOAD64(ptr);
+}
+
 static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
-  DCHECK(0 <= offset && offset <= 4) << offset;
+  DCHECK_GE(offset, 0);
+  DCHECK_LE(offset, 4);
   return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
 }
 
+#else
+
+typedef const char* EightBytesReference;
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+  return ptr;
+}
+
+static inline uint32 GetUint32AtOffset(const char* v, int offset) {
+  DCHECK_GE(offset, 0);
+  DCHECK_LE(offset, 4);
+  return UNALIGNED_LOAD32(v + offset);
+}
+
+#endif
+
 // Flat array compression that does not emit the "uncompressed length"
 // prefix. Compresses "input" string to the "*op" buffer.
 //
@@ -378,7 +411,7 @@ char* CompressFragment(const char* input,
       // though we don't yet know how big the literal will be.  We handle that
       // by proceeding to the next iteration of the main loop.  We also can exit
       // this loop via goto if we get close to exhausting the input.
-      uint64 input_bytes = 0;
+      EightBytesReference input_bytes;
       uint32 candidate_bytes = 0;
 
       do {
@@ -397,7 +430,7 @@ char* CompressFragment(const char* input,
         if (PREDICT_FALSE(ip >= ip_limit)) {
           goto emit_remainder;
         }
-        input_bytes = UNALIGNED_LOAD64(insert_tail);
+        input_bytes = GetEightBytesAt(insert_tail);
         uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
         table[prev_hash] = ip - base_ip - 1;
         uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
--
cgit v1.2.1
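
For readers without the snappy tree at hand, below is a minimal,
self-contained sketch of the technique the patch applies. The memcpy-based
Load64()/Load32() helpers and the __x86_64__ test are stand-ins for snappy's
UNALIGNED_LOAD64/UNALIGNED_LOAD32 macros and its ARCH_K8 define, the sketch
assumes a little-endian target, and the main() harness is purely
illustrative; none of this is taken verbatim from the library.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Portable unaligned loads; stand-ins for UNALIGNED_LOAD64/UNALIGNED_LOAD32.
static inline uint64_t Load64(const char* p) {
  uint64_t v;
  memcpy(&v, p, sizeof(v));
  return v;
}

static inline uint32_t Load32(const char* p) {
  uint32_t v;
  memcpy(&v, p, sizeof(v));
  return v;
}

#if defined(__x86_64__) || defined(_M_X64)  // stand-in for ARCH_K8

// 64-bit: do one wide load up front, then peel off each uint32 with shifts,
// avoiding the overlapping 32-bit loads that are slow on x86-64.
typedef uint64_t EightBytesReference;

static inline EightBytesReference GetEightBytesAt(const char* ptr) {
  return Load64(ptr);
}

static inline uint32_t GetUint32AtOffset(uint64_t v, int offset) {
  // Little-endian assumed: byte i of the buffer is bits [8i, 8i+8) of v.
  return static_cast<uint32_t>(v >> (8 * offset));
}

#else

// 32-bit: keep only the pointer and do each 32-bit load lazily, so the
// compiler never has to materialize a 64-bit value in a register pair.
typedef const char* EightBytesReference;

static inline EightBytesReference GetEightBytesAt(const char* ptr) {
  return ptr;
}

static inline uint32_t GetUint32AtOffset(const char* v, int offset) {
  return Load32(v + offset);
}

#endif

int main() {
  const char buf[8] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
  EightBytesReference bytes = GetEightBytesAt(buf);
  // Both variants satisfy the documented contract:
  //   GetUint32AtOffset(GetEightBytesAt(p), i) == UNALIGNED_LOAD32(p + i)
  // for 0 <= i <= 4.
  for (int i = 0; i <= 4; ++i) {
    printf("offset %d: %s\n", i,
           GetUint32AtOffset(bytes, i) == Load32(buf + i) ? "ok" : "MISMATCH");
  }
  return 0;
}

On a 64-bit build this compiles to a single eight-byte load followed by
register shifts; on a 32-bit build each GetUint32AtOffset() is its own
four-byte load, which is exactly the trade-off the patch is about.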