author     Snappy Team <no-reply@google.com>          2022-11-03 20:36:33 +0000
committer  Victor Costan <pwnall@chromium.org>        2023-01-12 13:32:54 +0000
commit     8881ba172a32913435ef570564bc90123d596693 (patch)
tree       e495127ea017359fe2bf88916e7d8e1c1e2a90c4
parent     a2d219a8a801ae522bac8e966de005fcb336821b (diff)
download   snappy-git-8881ba172a32913435ef570564bc90123d596693.tar.gz
Improve the speed of hashing in zippy compression.
This change replaces the hashing function used during compression with one
that is roughly as good but faster. This speeds up compression by two to a
few percent on the Intel-, AMD-, and Arm-based machines we tested. The amount
of compression is roughly unchanged.

PiperOrigin-RevId: 485960303
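For orientation, here is the core of the change reduced to a minimal standalone sketch. The function names and the kMaxHashTableBits value of 14 are assumptions for illustration; kMagic, the shift amounts, and the pointer arithmetic follow the diff below, and on CPUs with CRC support the multiply-and-shift is replaced by a single __crc32cw or _mm_crc32_u32 instruction.

    #include <cstdint>

    // Old scheme: a multiplicative hash yields an *index* into the uint16_t
    // table; mask == table_size - 1.
    inline uint32_t HashBytesOld(uint32_t bytes, uint32_t mask) {
      constexpr uint32_t kMagic = 0x1e35a7bd;
      constexpr int kMaxHashTableBits = 14;  // assumed value, for illustration
      return ((kMagic * bytes) >> (32 - kMaxHashTableBits)) & mask;
    }

    // New scheme, portable fallback path (no CRC instruction): the same
    // multiplicative hash, shifted one bit less so that (hash & mask) is a
    // *byte offset* into the table; mask == 2 * (table_size - 1).
    inline uint16_t* TableEntryNew(uint16_t* table, uint32_t bytes,
                                   uint32_t mask) {
      constexpr uint32_t kMagic = 0x1e35a7bd;
      constexpr int kMaxHashTableBits = 14;  // assumed value, for illustration
      const uint32_t hash = (kMagic * bytes) >> (31 - kMaxHashTableBits);
      return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
                                         (hash & mask));
    }

    // Example usage: both schemes locate a slot for the same 4 input bytes.
    int main() {
      static uint16_t table[1 << 14] = {};  // 16384 entries, zeroed
      const uint32_t bytes = 0x64636261;    // "abcd", little-endian
      const uint32_t old_index = HashBytesOld(bytes, (1u << 14) - 1);
      uint16_t* slot = TableEntryNew(table, bytes, 2 * ((1u << 14) - 1));
      *slot = 123;  // store a position, as the compressor does
      return old_index < (1u << 14) ? 0 : 1;  // old hash is always a valid index
    }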
-rw-r--r--  CMakeLists.txt       13
-rw-r--r--  cmake/config.h.in     7
-rw-r--r--  snappy.cc            79
3 files changed, 79 insertions, 20 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6eef485..2a0bc10 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,6 +175,19 @@ int main() {
check_cxx_source_compiles("
#include <immintrin.h>
int main() {
+ return _mm_crc32_u32(0, 1);
+}" SNAPPY_HAVE_X86_CRC32)
+
+check_cxx_source_compiles("
+#include <arm_neon.h>
+#include <arm_acle.h>
+int main() {
+ return __crc32cw(0, 1);
+}" SNAPPY_HAVE_NEON_CRC32)
+
+check_cxx_source_compiles("
+#include <immintrin.h>
+int main() {
return _bzhi_u32(0, 1);
}" SNAPPY_HAVE_BMI2)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 5ea2b5a..d1de25c 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -46,12 +46,19 @@
/* Define to 1 if you target processors with SSSE3+ and have <tmmintrin.h>. */
#cmakedefine01 SNAPPY_HAVE_SSSE3
+/* Define to 1 if you target processors with SSE4.2 and have <crc32intrin.h>. */
+#cmakedefine01 SNAPPY_HAVE_X86_CRC32
+
/* Define to 1 if you target processors with BMI2+ and have <bmi2intrin.h>. */
#cmakedefine01 SNAPPY_HAVE_BMI2
/* Define to 1 if you target processors with NEON and have <arm_neon.h>. */
#cmakedefine01 SNAPPY_HAVE_NEON
+/* Define to 1 if you have <arm_neon.h> and <arm_acle.h> and want to optimize
+ compression speed by using __crc32cw from <arm_acle.h>. */
+#cmakedefine01 SNAPPY_HAVE_NEON_CRC32
+
/* Define to 1 if your processor stores words with the most significant byte
first (like Motorola and SPARC, unlike Intel and VAX). */
#cmakedefine01 SNAPPY_IS_BIG_ENDIAN
diff --git a/snappy.cc b/snappy.cc
index 932f59f..57d7319 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -45,10 +45,28 @@
#endif
#endif // !defined(SNAPPY_HAVE_BMI2)
-#if SNAPPY_HAVE_BMI2
+#if !defined(SNAPPY_HAVE_X86_CRC32)
+#if defined(__SSE4_2__)
+#define SNAPPY_HAVE_X86_CRC32 1
+#else
+#define SNAPPY_HAVE_X86_CRC32 0
+#endif
+#endif // !defined(SNAPPY_HAVE_X86_CRC32)
+
+#if !defined(SNAPPY_HAVE_NEON_CRC32)
+#if SNAPPY_HAVE_NEON && defined(__ARM_FEATURE_CRC32)
+#define SNAPPY_HAVE_NEON_CRC32 1
+#else
+#define SNAPPY_HAVE_NEON_CRC32 0
+#endif
+#endif // !defined(SNAPPY_HAVE_NEON_CRC32)
+
+#if SNAPPY_HAVE_BMI2 || SNAPPY_HAVE_X86_CRC32
// Please do not replace with <x86intrin.h>. or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <immintrin.h>
+#elif SNAPPY_HAVE_NEON_CRC32
+#include <arm_acle.h>
#endif
#include <algorithm>
@@ -127,14 +145,34 @@ constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) {
alignas(64) const std::array<int16_t, 256> kLengthMinusOffset =
MakeTable(make_index_sequence<256>{});
-// Any hash function will produce a valid compressed bitstream, but a good
-// hash function reduces the number of collisions and thus yields better
-// compression for compressible input, and more speed for incompressible
-// input. Of course, it doesn't hurt if the hash function is reasonably fast
-// either, as it gets called a lot.
-inline uint32_t HashBytes(uint32_t bytes, uint32_t mask) {
+// Given a table of uint16_t whose size is mask / 2 + 1, return a pointer to the
+// relevant entry, if any, for the given bytes. Any hash function will do,
+// but a good hash function reduces the number of collisions and thus yields
+// better compression for compressible input.
+//
+// REQUIRES: mask is 2 * (table_size - 1), and table_size is a power of two.
+inline uint16_t* TableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) {
+ // Our choice is quicker-and-dirtier than the typical hash function;
+ // empirically, that seems beneficial. The upper bits of kMagic * bytes are a
+ // higher-quality hash than the lower bits, so when using kMagic * bytes we
+ // also shift right to get a higher-quality end result. There's no similar
+ // issue with a CRC because all of the output bits of a CRC are equally good
+ // "hashes." So, a CPU instruction for CRC, if available, tends to be a good
+ // choice.
+#if SNAPPY_HAVE_NEON_CRC32
+ // We use mask as the second arg to the CRC function, as it's about to
+ // be used anyway; it'd be equally correct to use 0 or some constant.
+ // Mathematically, _mm_crc32_u32 (or similar) is a function of the
+ // xor of its arguments.
+ const uint32_t hash = __crc32cw(bytes, mask);
+#elif SNAPPY_HAVE_X86_CRC32
+ const uint32_t hash = _mm_crc32_u32(bytes, mask);
+#else
constexpr uint32_t kMagic = 0x1e35a7bd;
- return ((kMagic * bytes) >> (32 - kMaxHashTableBits)) & mask;
+ const uint32_t hash = (kMagic * bytes) >> (31 - kMaxHashTableBits);
+#endif
+ return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
+ (hash & mask));
}
} // namespace
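A small self-contained check, assuming a hypothetical table_size of 1 << 14, of the REQUIRES invariant above: because mask = 2 * (table_size - 1) has its low bit clear, hash & mask is always an even, in-bounds byte offset, so it can be added directly to the table's byte address without an index-to-offset multiply.

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical table size; the invariant only needs a power of two.
      constexpr uint32_t table_size = 1u << 14;
      constexpr uint32_t mask = 2 * (table_size - 1);  // 0x7FFE: low bit clear
      const uint32_t samples[] = {0u, 1u, 0x1e35a7bdu, 0xFFFFFFFFu};
      for (uint32_t hash : samples) {
        const uint32_t byte_offset = hash & mask;
        assert(byte_offset % sizeof(uint16_t) == 0);          // always aligned
        assert(byte_offset / sizeof(uint16_t) < table_size);  // always in bounds
      }
      return 0;
    }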
@@ -727,7 +765,7 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
const char* ip = input;
assert(input_size <= kBlockSize);
assert((table_size & (table_size - 1)) == 0); // table must be power of two
- const uint32_t mask = table_size - 1;
+ const uint32_t mask = 2 * (table_size - 1);
const char* ip_end = input + input_size;
const char* base_ip = ip;
@@ -778,11 +816,11 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
// loaded in preload.
uint32_t dword = i == 0 ? preload : static_cast<uint32_t>(data);
assert(dword == LittleEndian::Load32(ip + i));
- uint32_t hash = HashBytes(dword, mask);
- candidate = base_ip + table[hash];
+ uint16_t* table_entry = TableEntry(table, dword, mask);
+ candidate = base_ip + *table_entry;
assert(candidate >= base_ip);
assert(candidate < ip + i);
- table[hash] = delta + i;
+ *table_entry = delta + i;
if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) == dword)) {
*op = LITERAL | (i << 2);
UnalignedCopy128(next_emit, op + 1);
@@ -799,7 +837,7 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
}
while (true) {
assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));
- uint32_t hash = HashBytes(data, mask);
+ uint16_t* table_entry = TableEntry(table, data, mask);
uint32_t bytes_between_hash_lookups = skip >> 5;
skip += bytes_between_hash_lookups;
const char* next_ip = ip + bytes_between_hash_lookups;
@@ -807,11 +845,11 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
ip = next_emit;
goto emit_remainder;
}
- candidate = base_ip + table[hash];
+ candidate = base_ip + *table_entry;
assert(candidate >= base_ip);
assert(candidate < ip);
- table[hash] = ip - base_ip;
+ *table_entry = ip - base_ip;
if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
LittleEndian::Load32(candidate))) {
break;
@@ -857,12 +895,13 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
assert((data & 0xFFFFFFFFFF) ==
(LittleEndian::Load64(ip) & 0xFFFFFFFFFF));
// We are now looking for a 4-byte match again. We read
- // table[Hash(ip, shift)] for that. To improve compression,
+ // table[Hash(ip, mask)] for that. To improve compression,
// we also update table[Hash(ip - 1, mask)] and table[Hash(ip, mask)].
- table[HashBytes(LittleEndian::Load32(ip - 1), mask)] = ip - base_ip - 1;
- uint32_t hash = HashBytes(data, mask);
- candidate = base_ip + table[hash];
- table[hash] = ip - base_ip;
+ *TableEntry(table, LittleEndian::Load32(ip - 1), mask) =
+ ip - base_ip - 1;
+ uint16_t* table_entry = TableEntry(table, data, mask);
+ candidate = base_ip + *table_entry;
+ *table_entry = ip - base_ip;
// Measurements on the benchmarks have shown the following probabilities
// for the loop to exit (ie. avg. number of iterations is reciprocal).
// BM_Flat/6 txt1 p = 0.3-0.4