diff options
author | costan <costan@google.com> | 2017-02-27 14:29:18 -0800 |
---|---|---|
committer | Victor Costan <pwnall@chromium.org> | 2017-02-28 14:08:46 -0800 |
commit | ea175e28f8ef7f6a8f5931ebad1835d95ec466ed (patch) | |
tree | 8c52c8a394b42444589e4d3929093d03982f32ef /port | |
parent | 95cd743e5e71c7b06e7149a837e33b91309dfa48 (diff) | |
download | leveldb-ea175e28f8ef7f6a8f5931ebad1835d95ec466ed.tar.gz |
Implement support for Intel crc32 instruction (SSE 4.2)
This change authored by vadimskipin and submitted via:
https://github.com/google/leveldb/pull/309
Changes made to support iOS builds and other architectures
without support for SSE 4.2.
db_bench reports original crc32 speed at:
crc32c : 3.610 micros/op; 1082.0 MB/s (4K per op)
with this change performance has increased to:
crc32c : 0.843 micros/op; 4633.6 MB/s (4K per op)
-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=148694935
Diffstat (limited to 'port')
-rw-r--r-- | port/port_example.h | 6 | ||||
-rw-r--r-- | port/port_posix.h | 2 | ||||
-rw-r--r-- | port/port_posix_sse.cc | 125 |
3 files changed, 133 insertions, 0 deletions
diff --git a/port/port_example.h b/port/port_example.h index ab9e489..97bd669 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -129,6 +129,12 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length, // The concatenation of all "data[0,n-1]" fragments is the heap profile. extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); +// Extend the CRC to include the first size bytes of buf. +// +// Returns zero if the CRC cannot be extended using acceleration, else returns +// the newly extended CRC value (which may also be zero). +uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); + } // namespace port } // namespace leveldb diff --git a/port/port_posix.h b/port/port_posix.h index 89fc222..d67ab68 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -148,6 +148,8 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { return false; } +uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); + } // namespace port } // namespace leveldb diff --git a/port/port_posix_sse.cc b/port/port_posix_sse.cc new file mode 100644 index 0000000..57ec8fe --- /dev/null +++ b/port/port_posix_sse.cc @@ -0,0 +1,125 @@ +// Copyright 2016 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An implementation of crc32c accelerated with the x86 SSE 4.2 crc32 +// instruction. +// +// In a separate source file to allow this accelerated CRC32C function to be +// compiled with the appropriate compiler flags to enable x86 SSE 4.2 +// instructions. 
+ +#include <stdint.h> +#include <string.h> +#include "port/port.h" + +#if defined(LEVELDB_PLATFORM_POSIX_SSE) + +#if defined(_MSC_VER) +#include <intrin.h> +#elif defined(__GNUC__) && defined(__SSE4_2__) +#include <nmmintrin.h> +#include <cpuid.h> +#endif + +#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) + +namespace leveldb { +namespace port { + +#if defined(LEVELDB_PLATFORM_POSIX_SSE) + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + // SSE is x86 only, so ensured that |p| is always little-endian. + uint32_t word; + memcpy(&word, p, sizeof(word)); + return word; +} + +// Used to fetch a naturally-aligned 64-bit word in little endian byte-order +static inline uint64_t LE_LOAD64(const uint8_t *p) { + uint64_t dword; + memcpy(&dword, p, sizeof(dword)); + return dword; +} + +static inline bool HaveSSE42() { +#if defined(_MSC_VER) + int cpu_info[4]; + __cpuid(cpu_info, 1); + return (cpu_info[2] & (1 << 20)) != 0; +#elif defined(__GNUC__) + unsigned int eax, ebx, ecx, edx; + __get_cpuid(1, &eax, &ebx, &ecx, &edx); + return (ecx & (1 << 20)) != 0; +#else + return false; +#endif +} + +#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) + +// For further improvements see Intel publication at: +// http://download.intel.com/design/intarch/papers/323405.pdf +uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) { +#if !defined(LEVELDB_PLATFORM_POSIX_SSE) + return 0; +#else + static bool have = HaveSSE42(); + if (!have) { + return 0; + } + + const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); + const uint8_t *e = p + size; + uint32_t l = crc ^ 0xffffffffu; + +#define STEP1 do { \ + l = _mm_crc32_u8(l, *p++); \ +} while (0) +#define STEP4 do { \ + l = _mm_crc32_u32(l, LE_LOAD32(p)); \ + p += 4; \ +} while (0) +#define STEP8 do { \ + l = _mm_crc32_u64(l, LE_LOAD64(p)); \ + p += 8; \ +} while (0) + + if (size > 16) { + // Process unaligned bytes + for (unsigned int i = 
reinterpret_cast<uintptr_t>(p) % 8; i; --i) { + STEP1; + } + + // _mm_crc32_u64 is only available on x64. +#if defined(_M_X64) || defined(__x86_64__) + // Process 8 bytes at a time + while ((e-p) >= 8) { + STEP8; + } + // Process 4 bytes at a time + if ((e-p) >= 4) { + STEP4; + } +#else // !(defined(_M_X64) || defined(__x86_64__)) + // Process 4 bytes at a time + while ((e-p) >= 4) { + STEP4; + } +#endif // defined(_M_X64) || defined(__x86_64__) + } + // Process the last few bytes + while (p != e) { + STEP1; + } +#undef STEP8 +#undef STEP4 +#undef STEP1 + return l ^ 0xffffffffu; +#endif // defined(LEVELDB_PLATFORM_POSIX_SSE) +} + +} // namespace port +} // namespace leveldb |