diff options
author | costan <costan@google.com> | 2017-10-10 06:05:17 -0700 |
---|---|---|
committer | Victor Costan <pwnall@chromium.org> | 2017-10-10 11:46:40 -0700 |
commit | 5c39524f3639e6bf6ab49215152d24273e662986 (patch) | |
tree | 5db7a01ab37b983955173f185ae83bf22c75d7c5 /port | |
parent | ca216e493f32278f50a823811ab95f64cf0f839b (diff) | |
download | leveldb-5c39524f3639e6bf6ab49215152d24273e662986.tar.gz |
Replace SSE-optimized CRC32C in POSIX port with external library.
Maintaining a hardware-accelerated CRC32C implementation tailored for
all modern platforms deserves a repository of its own. We extracted the
implementation here into https://github.com/google/crc32c and improved
it in that repository. This CL removes the SSE-optimized implementation
from this codebase, and adds the ability to use the google/crc32c
library, if it is present on the system.
The benchmarks below show the performance impact of the change. In
summary, open source builds that use the google/crc32c library can
expect a 3x improvement in CRC32C throughput, whereas builds that do not
use the library will see a 50% drop in CRC32C throughput. This
translates in much smaller changes in overall leveldb performance.
Baseline, MacBookPro13,3 with Core i7 6920HQ:
LevelDB: version 1.20
Keys: 16 bytes each
Values: 100 bytes each (50 bytes after compression)
Entries: 1000000
RawSize: 110.6 MB (estimated)
FileSize: 62.9 MB (estimated)
------------------------------------------------
fillseq : 3.064 micros/op; 36.1 MB/s
fillsync : 57.861 micros/op; 1.9 MB/s (1000 ops)
fillrandom : 3.887 micros/op; 28.5 MB/s
overwrite : 4.140 micros/op; 26.7 MB/s
readrandom : 7.433 micros/op; (1000000 of 1000000 found)
readrandom : 6.825 micros/op; (1000000 of 1000000 found)
readseq : 0.244 micros/op; 453.4 MB/s
readreverse : 0.387 micros/op; 285.8 MB/s
compact : 449707.000 micros/op;
readrandom : 4.196 micros/op; (1000000 of 1000000 found)
readseq : 0.228 micros/op; 485.8 MB/s
readreverse : 0.320 micros/op; 345.2 MB/s
fill100K : 562.556 micros/op; 169.6 MB/s (1000 ops)
crc32c : 0.768 micros/op; 5085.0 MB/s (4K per op)
snappycomp : 4.220 micros/op; 925.7 MB/s (output: 55.1%)
snappyuncomp : 0.635 micros/op; 6155.7 MB/s
acquireload : 13.054 micros/op; (each op is 1000 loads)
New with crc32c, MacBookPro13,3 with Core i7 6920HQ:
LevelDB: version 1.20
Keys: 16 bytes each
Values: 100 bytes each (50 bytes after compression)
Entries: 1000000
RawSize: 110.6 MB (estimated)
FileSize: 62.9 MB (estimated)
------------------------------------------------
fillseq : 2.820 micros/op; 39.2 MB/s
fillsync : 51.988 micros/op; 2.1 MB/s (1000 ops)
fillrandom : 3.747 micros/op; 29.5 MB/s
overwrite : 4.047 micros/op; 27.3 MB/s
readrandom : 7.287 micros/op; (1000000 of 1000000 found)
readrandom : 6.927 micros/op; (1000000 of 1000000 found)
readseq : 0.253 micros/op; 437.5 MB/s
readreverse : 0.411 micros/op; 269.2 MB/s
compact : 440405.000 micros/op;
readrandom : 4.159 micros/op; (1000000 of 1000000 found)
readseq : 0.230 micros/op; 481.1 MB/s
readreverse : 0.320 micros/op; 345.9 MB/s
fill100K : 558.222 micros/op; 170.9 MB/s (1000 ops)
crc32c : 0.214 micros/op; 18263.5 MB/s (4K per op)
snappycomp : 4.471 micros/op; 873.7 MB/s (output: 55.1%)
snappyuncomp : 0.833 micros/op; 4688.5 MB/s
acquireload : 13.289 micros/op; (each op is 1000 loads)
New without crc32c, MacBookPro13,3 with Core i7 6920HQ
LevelDB: version 1.20
Keys: 16 bytes each
Values: 100 bytes each (50 bytes after compression)
Entries: 1000000
RawSize: 110.6 MB (estimated)
FileSize: 62.9 MB (estimated)
------------------------------------------------
fillseq : 3.094 micros/op; 35.8 MB/s
fillsync : 52.160 micros/op; 2.1 MB/s (1000 ops)
fillrandom : 4.090 micros/op; 27.0 MB/s
overwrite : 4.006 micros/op; 27.6 MB/s
readrandom : 6.584 micros/op; (1000000 of 1000000 found)
readrandom : 6.676 micros/op; (1000000 of 1000000 found)
readseq : 0.280 micros/op; 395.2 MB/s
readreverse : 0.391 micros/op; 283.2 MB/s
compact : 433911.000 micros/op;
readrandom : 4.261 micros/op; (1000000 of 1000000 found)
readseq : 0.251 micros/op; 440.5 MB/s
readreverse : 0.356 micros/op; 310.9 MB/s
fill100K : 584.023 micros/op; 163.3 MB/s (1000 ops)
crc32c : 1.384 micros/op; 2822.3 MB/s (4K per op)
snappycomp : 4.763 micros/op; 820.1 MB/s (output: 55.1%)
snappyuncomp : 0.766 micros/op; 5098.6 MB/s
acquireload : 12.931 micros/op; (each op is 1000 loads)
-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=171667771
Diffstat (limited to 'port')
-rw-r--r-- | port/port_posix.h | 15 | ||||
-rw-r--r-- | port/port_posix_sse.cc | 133 |
2 files changed, 12 insertions, 136 deletions
diff --git a/port/port_posix.h b/port/port_posix.h index 2bc5223..2a30be3 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -39,6 +39,9 @@ #endif #include <pthread.h> +#if defined(HAVE_CRC32C) +#include <crc32c/crc32c.h> +#endif // defined(HAVE_CRC32C) #ifdef HAVE_SNAPPY #include <snappy.h> #endif // defined(HAVE_SNAPPY) @@ -139,9 +142,15 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { return false; } -uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); +inline uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) { +#if defined(HAVE_CRC32C) + return ::crc32c::Extend(crc, reinterpret_cast<const uint8_t*>(buf), size); +#else + return 0; +#endif // defined(HAVE_CRC32C) +} -} // namespace port -} // namespace leveldb +} // namespace port +} // namespace leveldb #endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ diff --git a/port/port_posix_sse.cc b/port/port_posix_sse.cc deleted file mode 100644 index 08d9aee..0000000 --- a/port/port_posix_sse.cc +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2016 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A portable implementation of crc32c, optimized to handle -// four bytes at a time. -// -// In a separate source file to allow this accelerated CRC32C function to be -// compiled with the appropriate compiler flags to enable x86 SSE 4.2 -// instructions. - -#include <stdint.h> -#include <string.h> -#include "port/port.h" - -#if defined(LEVELDB_PLATFORM_POSIX_SSE) - -#if defined(_MSC_VER) -#include <intrin.h> -#elif defined(__GNUC__) && defined(__SSE4_2__) -#include <nmmintrin.h> -#include <cpuid.h> -#endif - -#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) - -namespace leveldb { -namespace port { - -#if defined(LEVELDB_PLATFORM_POSIX_SSE) - -// Used to fetch a naturally-aligned 32-bit word in little endian byte-order -static inline uint32_t LE_LOAD32(const uint8_t *p) { - // SSE is x86 only, so ensured that |p| is always little-endian. - uint32_t word; - memcpy(&word, p, sizeof(word)); - return word; -} - -#if defined(_M_X64) || defined(__x86_64__) // LE_LOAD64 is only used on x64. - -// Used to fetch a naturally-aligned 64-bit word in little endian byte-order -static inline uint64_t LE_LOAD64(const uint8_t *p) { - uint64_t dword; - memcpy(&dword, p, sizeof(dword)); - return dword; -} - -#endif // defined(_M_X64) || defined(__x86_64__) - -static inline bool HaveSSE42() { -#if defined(_MSC_VER) - int cpu_info[4]; - __cpuid(cpu_info, 1); - return (cpu_info[2] & (1 << 20)) != 0; -#elif defined(__GNUC__) - unsigned int eax, ebx, ecx, edx; - __get_cpuid(1, &eax, &ebx, &ecx, &edx); - return (ecx & (1 << 20)) != 0; -#else - return false; -#endif -} - -#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) - -// For further improvements see Intel publication at: -// http://download.intel.com/design/intarch/papers/323405.pdf -uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) { -#if !defined(LEVELDB_PLATFORM_POSIX_SSE) - return 0; -#else - static bool have = HaveSSE42(); - if (!have) { - return 0; - } - - const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); - const uint8_t *e = p + size; - uint32_t l = crc ^ 0xffffffffu; - -#define STEP1 do { \ - l = _mm_crc32_u8(l, *p++); \ -} while (0) -#define STEP4 do { \ - l = _mm_crc32_u32(l, LE_LOAD32(p)); \ - p += 4; \ -} while (0) -#define STEP8 do { \ - l = _mm_crc32_u64(l, LE_LOAD64(p)); \ - p += 8; \ -} while (0) - - if (size > 16) { - // Point x at first 8-byte aligned byte in string. This must be inside the - // string, due to the size check above. - const uintptr_t pval = reinterpret_cast<uintptr_t>(p); - const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 7) >> 3) << 3); - // Process bytes until p is 8-byte aligned. - while (p != x) { - STEP1; - } - - // _mm_crc32_u64 is only available on x64. -#if defined(_M_X64) || defined(__x86_64__) - // Process 8 bytes at a time - while ((e-p) >= 8) { - STEP8; - } - // Process 4 bytes at a time - if ((e-p) >= 4) { - STEP4; - } -#else // !(defined(_M_X64) || defined(__x86_64__)) - // Process 4 bytes at a time - while ((e-p) >= 4) { - STEP4; - } -#endif // defined(_M_X64) || defined(__x86_64__) - } - // Process the last few bytes - while (p != e) { - STEP1; - } -#undef STEP8 -#undef STEP4 -#undef STEP1 - return l ^ 0xffffffffu; -#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) -} - -} // namespace port -} // namespace leveldb |