diff options
author | Snappy Team <no-reply@google.com> | 2020-12-03 03:15:10 +0000 |
---|---|---|
committer | Victor Costan <costan@google.com> | 2020-12-03 22:52:52 +0000 |
commit | 56c2c247d037ce4b274c93e4bb638662cb79378a (patch) | |
tree | cb7ba247d2a8f979a57d0894dc3b080d0d866668 | |
parent | a94be58e65dac5d676c219199b015cfc32d7ae6e (diff) | |
download | snappy-git-56c2c247d037ce4b274c93e4bb638662cb79378a.tar.gz |
Internal change
PiperOrigin-RevId: 345360683
-rw-r--r-- | snappy.cc | 280 |
1 files changed, 56 insertions, 224 deletions
@@ -74,7 +74,6 @@ #include <cstdio> #include <cstring> #include <string> -#include <utility> #include <vector> namespace snappy { @@ -179,16 +178,6 @@ void UnalignedCopy128(const void* src, void* dst) { std::memcpy(dst, tmp, 16); } -template <bool use_16bytes_chunk> -inline void ConditionalUnalignedCopy128(const char* src, char* dst) { - if (use_16bytes_chunk) { - UnalignedCopy128(src, dst); - } else { - UnalignedCopy64(src, dst); - UnalignedCopy64(src + 8, dst + 8); - } -} - // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used // for handling COPY operations where the input and output regions may overlap. // For example, suppose: @@ -216,164 +205,36 @@ inline char* IncrementalCopySlow(const char* src, char* op, #if SNAPPY_HAVE_SSSE3 -// Computes the bytes for shuffle control mask (please read comments on -// 'pattern_generation_masks' as well) for the given index_offset and -// pattern_size. For example, when the 'offset' is 6, it will generate a -// repeating pattern of size 6. So, the first 16 byte indexes will correspond to -// the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and the -// next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1, 2, 3, -// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by -// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and -// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively. -template <size_t... indexes> -inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes( - int index_offset, int pattern_size, index_sequence<indexes...>) { - return {static_cast<char>((index_offset + indexes) % pattern_size)...}; -} - -// Computes the shuffle control mask bytes array for given pattern-sizes and -// returns an array. -template <size_t... pattern_sizes_minus_one> -inline constexpr std::array<std::array<char, sizeof(__m128i)>, - sizeof...(pattern_sizes_minus_one)> -MakePatternMaskBytesTable(int index_offset, - index_sequence<pattern_sizes_minus_one...>) { - return {MakePatternMaskBytes( - index_offset, pattern_sizes_minus_one + 1, - make_index_sequence</*indexes=*/sizeof(__m128i)>())...}; -} - -// This is an array of shuffle control masks that can be used as the source +// This is a table of shuffle control masks that can be used as the source // operand for PSHUFB to permute the contents of the destination XMM register // into a repeating byte pattern. -alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>, - 16> pattern_generation_masks = - MakePatternMaskBytesTable( - /*index_offset=*/0, - /*pattern_sizes_minus_one=*/make_index_sequence<16>()); - -// Similar to 'pattern_generation_masks', this table is used to "rotate" the -// pattern so that we can copy the *next 16 bytes* consistent with the pattern. -// Basically, pattern_reshuffle_masks is a continuation of -// pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as -// pattern_generation_masks for offsets 1, 2, 4, 8 and 16. -alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>, - 16> pattern_reshuffle_masks = - MakePatternMaskBytesTable( - /*index_offset=*/16, - /*pattern_sizes_minus_one=*/make_index_sequence<16>()); - -SNAPPY_ATTRIBUTE_ALWAYS_INLINE -static inline __m128i LoadPattern(const char* src, const size_t pattern_size) { - __m128i generation_mask = _mm_load_si128(reinterpret_cast<const __m128i*>( - pattern_generation_masks[pattern_size - 1].data())); - // Uninitialized bytes are masked out by the shuffle mask. - // TODO: remove annotation and macro defs once MSan is fixed. - SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size); - return _mm_shuffle_epi8( - _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)), generation_mask); -} - -SNAPPY_ATTRIBUTE_ALWAYS_INLINE -static inline std::pair<__m128i /* pattern */, __m128i /* reshuffle_mask */> -LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) { - __m128i pattern = LoadPattern(src, pattern_size); - - // This mask will generate the next 16 bytes in-place. Doing so enables us to - // write data by at most 4 _mm_storeu_si128. - // - // For example, suppose pattern is: abcdefabcdefabcd - // Shuffling with this mask will generate: efabcdefabcdefab - // Shuffling again will generate: cdefabcdefabcdef - __m128i reshuffle_mask = _mm_load_si128(reinterpret_cast<const __m128i*>( - pattern_reshuffle_masks[pattern_size - 1].data())); - return {pattern, reshuffle_mask}; -} +alignas(16) const char pshufb_fill_patterns[7][16] = { + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}, + {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, + {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, + {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0}, + {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3}, + {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1}, +}; -#endif // SNAPPY_HAVE_SSSE3 +// j * (16 / j) for all j from 0 to 7. 0 is not actually used. +const uint8_t pattern_size_table[8] = {0, 16, 16, 15, 16, 15, 12, 14}; -// Fallback for when we need to copy while extending the pattern, for example -// copying 10 bytes from 3 positions back abc -> abcabcabcabca. -// -// REQUIRES: [dst - offset, dst + 64) is a valid address range. -SNAPPY_ATTRIBUTE_ALWAYS_INLINE -static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) { -#if SNAPPY_HAVE_SSSE3 - if (SNAPPY_PREDICT_TRUE(offset <= 16)) { - switch (offset) { - case 0: - return false; - case 1: { - std::memset(dst, dst[-1], 64); - return true; - } - case 2: - case 4: - case 8: - case 16: { - __m128i pattern = LoadPattern(dst - offset, offset); - for (int i = 0; i < 4; i++) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern); - } - return true; - } - default: { - auto pattern_and_reshuffle_mask = - LoadPatternAndReshuffleMask(dst - offset, offset); - __m128i pattern = pattern_and_reshuffle_mask.first; - __m128i reshuffle_mask = pattern_and_reshuffle_mask.second; - for (int i = 0; i < 4; i++) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern); - pattern = _mm_shuffle_epi8(pattern, reshuffle_mask); - } - return true; - } - } - } -#else - if (SNAPPY_PREDICT_TRUE(offset < 16)) { - if (SNAPPY_PREDICT_FALSE(offset == 0)) return false; - // Extend the pattern to the first 16 bytes. - for (int i = 0; i < 16; i++) dst[i] = dst[i - offset]; - // Find a multiple of pattern >= 16. - static std::array<uint8_t, 16> pattern_sizes = []() { - std::array<uint8_t, 16> res; - for (int i = 1; i < 16; i++) res[i] = (16 / i + 1) * i; - return res; - }(); - offset = pattern_sizes[offset]; - for (int i = 1; i < 4; i++) { - std::memcpy(dst + i * 16, dst + i * 16 - offset, 16); - } - return true; - } #endif // SNAPPY_HAVE_SSSE3 - // Very rare. - for (int i = 0; i < 4; i++) { - std::memcpy(dst + i * 16, dst + i * 16 - offset, 16); - } - return true; -} - // Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than // IncrementalCopySlow. buf_limit is the address past the end of the writable // region of the buffer. inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, char* const buf_limit) { -#if SNAPPY_HAVE_SSSE3 - constexpr int big_pattern_size_lower_bound = 16; -#else - constexpr int big_pattern_size_lower_bound = 8; -#endif - // Terminology: // // slop = buf_limit - op // pat = op - src // len = limit - op assert(src < op); - assert(op < op_limit); + assert(op <= op_limit); assert(op_limit <= buf_limit); // NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64. assert(op_limit - op <= 64); @@ -404,13 +265,11 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, // input. In general if we always predict len <= 16 it would be an ok // prediction. // - // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE) - // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a - // time. + // In order to be fast we want a pattern >= 8 bytes and an unrolled loop + // copying 2x 8 bytes at a time. - // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE) - // bytes. - if (pattern_size < big_pattern_size_lower_bound) { + // Handle the uncommon case where pattern is less than 8 bytes. + if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) { #if SNAPPY_HAVE_SSSE3 // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB // to permute the register's contents in-place into a repeating sequence of @@ -424,53 +283,22 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, // The non-SSE fallback implementation suffers from store-forwarding stalls // because its loads and stores partly overlap. By expanding the pattern // in-place, we avoid the penalty. - - // Typically, the op_limit is the gating factor so try to simplify the loop - // based on that. - if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) { - auto pattern_and_reshuffle_mask = - LoadPatternAndReshuffleMask(src, pattern_size); - __m128i pattern = pattern_and_reshuffle_mask.first; - __m128i reshuffle_mask = pattern_and_reshuffle_mask.second; - - // There is at least one, and at most four 16-byte blocks. Writing four - // conditionals instead of a loop allows FDO to layout the code with - // respect to the actual probabilities of each length. - // TODO: Replace with loop with trip count hint. - _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern); - - if (op + 16 < op_limit) { - pattern = _mm_shuffle_epi8(pattern, reshuffle_mask); - _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 16), pattern); - } - if (op + 32 < op_limit) { - pattern = _mm_shuffle_epi8(pattern, reshuffle_mask); - _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 32), pattern); - } - if (op + 48 < op_limit) { - pattern = _mm_shuffle_epi8(pattern, reshuffle_mask); - _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 48), pattern); - } - return op_limit; - } - char* const op_end = buf_limit - 15; - if (SNAPPY_PREDICT_TRUE(op < op_end)) { - auto pattern_and_reshuffle_mask = - LoadPatternAndReshuffleMask(src, pattern_size); - __m128i pattern = pattern_and_reshuffle_mask.first; - __m128i reshuffle_mask = pattern_and_reshuffle_mask.second; - - // This code path is relatively cold however so we save code size - // by avoiding unrolling and vectorizing. - // - // TODO: Remove pragma when when cold regions don't get - // vectorized or unrolled. -#pragma nounroll - do { + if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) { + const __m128i shuffle_mask = _mm_load_si128( + reinterpret_cast<const __m128i*>(pshufb_fill_patterns) + + pattern_size - 1); + const __m128i pattern = _mm_shuffle_epi8( + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask); + // Uninitialized bytes are masked out by the shuffle mask. + // TODO: remove annotation and macro defs once MSan is fixed. + SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern)); + pattern_size = pattern_size_table[pattern_size]; + char* op_end = std::min(op_limit, buf_limit - 15); + while (op < op_end) { _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern); - pattern = _mm_shuffle_epi8(pattern, reshuffle_mask); - op += 16; - } while (SNAPPY_PREDICT_TRUE(op < op_end)); + op += pattern_size; + } + if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit; } return IncrementalCopySlow(src, op, op_limit); #else // !SNAPPY_HAVE_SSSE3 @@ -492,30 +320,34 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, } #endif // SNAPPY_HAVE_SSSE3 } - assert(pattern_size >= big_pattern_size_lower_bound); - constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16; + assert(pattern_size >= 8); - // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src can - // be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in op. - // UnalignedCopy64 is safe because expanding the pattern to at least 8 bytes - // guarantees that op - src >= 8. + // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single + // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe + // because expanding the pattern to at least 8 bytes guarantees that + // op - src >= 8. // // Typically, the op_limit is the gating factor so try to simplify the loop // based on that. - if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) { + if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) { // There is at least one, and at most four 16-byte blocks. Writing four // conditionals instead of a loop allows FDO to layout the code with respect // to the actual probabilities of each length. // TODO: Replace with loop with trip count hint. - ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op); + UnalignedCopy64(src, op); + UnalignedCopy64(src + 8, op + 8); + if (op + 16 < op_limit) { - ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16); + UnalignedCopy64(src + 16, op + 16); + UnalignedCopy64(src + 24, op + 24); } if (op + 32 < op_limit) { - ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32); + UnalignedCopy64(src + 32, op + 32); + UnalignedCopy64(src + 40, op + 40); } if (op + 48 < op_limit) { - ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48); + UnalignedCopy64(src + 48, op + 48); + UnalignedCopy64(src + 56, op + 56); } return op_limit; } @@ -526,9 +358,12 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, // // TODO: Remove pragma when when cold regions don't get vectorized // or unrolled. -#pragma nounroll +#ifdef __clang__ +#pragma clang loop unroll(disable) +#endif for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) { - ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op); + UnalignedCopy64(src, op); + UnalignedCopy64(src + 8, op + 8); } if (op >= op_limit) return op_limit; @@ -1059,10 +894,10 @@ std::pair<const uint8_t*, char*> DecompressBranchless( if (SNAPPY_PREDICT_FALSE(std::size_t(offset) < len)) { assert(tag_type != 0); // offset 0 is an error. - if (!Copy64BytesWithPatternExtension(op_base + op, offset)) { - break; - } - op += len; + if (SNAPPY_PREDICT_FALSE(offset == 0)) break; + op = IncrementalCopy(op_base + delta, op_base + op, op_base + op + len, + op_base + op_limit) - + op_base; continue; } @@ -1254,7 +1089,6 @@ class SnappyDecompressor { preload = LittleEndian::Load32(ip); const uint32_t trailer = ExtractLowBytes(preload, c & 3); const uint32_t length = entry & 0xff; - assert(length > 0); // copy_offset/256 is encoded in bits 8..10. By just fetching // those bits, we get copy_offset (since the bit-field starts at @@ -1625,7 +1459,6 @@ class SnappyIOVecWriter { if (to_copy > len) { to_copy = len; } - assert(to_copy > 0); IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset), curr_iov_output_, curr_iov_output_ + to_copy, @@ -1719,7 +1552,6 @@ class SnappyArrayWriter { SNAPPY_ATTRIBUTE_ALWAYS_INLINE inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) { - assert(len > 0); char* const op = *op_p; assert(op >= base_); char* const op_end = op + len; |