diff options
-rw-r--r-- | CONTRIBUTING.md | 33 | ||||
-rw-r--r-- | README.md | 26 | ||||
-rw-r--r-- | snappy-internal.h | 12 | ||||
-rw-r--r-- | snappy.cc | 48 |
4 files changed, 72 insertions, 47 deletions
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d0ce551..66a60d5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,30 +3,10 @@ We'd love to accept your patches and contributions to this project. There are just a few small guidelines you need to follow. -## Project Goals - -In addition to the aims listed at the top of the [README](README.md) Snappy -explicitly supports the following: - -1. C++11 -2. Clang (gcc and MSVC are best-effort). -3. Low level optimizations (e.g. assembly or equivalent intrinsics) for: - 1. [x86](https://en.wikipedia.org/wiki/X86) - 2. [x86-64](https://en.wikipedia.org/wiki/X86-64) - 3. ARMv7 (32-bit) - 4. ARMv8 (AArch64) -4. Supports only the Snappy compression scheme as described in - [format_description.txt](format_description.txt). -5. CMake for building - -Changes adding features or dependencies outside of the core area of focus listed -above might not be accepted. If in doubt post a message to the -[Snappy discussion mailing list](https://groups.google.com/g/snappy-compression). - ## Contributor License Agreement Contributions to this project must be accompanied by a Contributor License -Agreement. You (or your employer) retain the copyright to your contribution, +Agreement. You (or your employer) retain the copyright to your contribution; this simply gives us permission to use and redistribute your contributions as part of the project. Head over to <https://cla.developers.google.com/> to see your current agreements on file or to sign a new one. @@ -35,12 +15,17 @@ You generally only need to submit a CLA once, so if you've already submitted one (even if it was for a different project), you probably don't need to do it again. -## Code reviews +## Code Reviews All submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose. Consult [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on using pull requests. -Please make sure that all the automated checks (CLA, AppVeyor, Travis) pass for -your pull requests. Pull requests whose checks fail may be ignored. +See [the README](README.md#contributing-to-the-snappy-project) for areas +where we are likely to accept external contributions. + +## Community Guidelines + +This project follows [Google's Open Source Community +Guidelines](https://opensource.google/conduct/). @@ -131,6 +131,32 @@ should provide a reasonably balanced starting point for benchmarking. (Note that baddata[1-3].snappy are not intended as benchmarks; they are used to verify correctness in the presence of corrupted data in the unit test.) +Contributing to the Snappy Project +================================== + +In addition to the aims listed at the top of the [README](README.md) Snappy +explicitly supports the following: + +1. C++11 +2. Clang (gcc and MSVC are best-effort). +3. Low level optimizations (e.g. assembly or equivalent intrinsics) for: + 1. [x86](https://en.wikipedia.org/wiki/X86) + 2. [x86-64](https://en.wikipedia.org/wiki/X86-64) + 3. ARMv7 (32-bit) + 4. ARMv8 (AArch64) +4. Supports only the Snappy compression scheme as described in + [format_description.txt](format_description.txt). +5. CMake for building + +Changes adding features or dependencies outside of the core area of focus listed +above might not be accepted. If in doubt post a message to the +[Snappy discussion mailing list](https://groups.google.com/g/snappy-compression). + +We are unlikely to accept contributions to the build configuration files, such +as `CMakeLists.txt`. We are focused on maintaining a build configuration that +allows us to test that the project works in a few supported configurations +inside Google. We are not currently interested in supporting other requirements, +such as different operating systems, compilers, or build systems. Contact ======= diff --git a/snappy-internal.h b/snappy-internal.h index ae7ab5a..e552ea0 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -230,8 +230,9 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1, uint64_t xorval = a1 ^ a2; int shift = Bits::FindLSBSetNonZero64(xorval); size_t matched_bytes = shift >> 3; + uint64_t a3 = UNALIGNED_LOAD64(s2 + 4); #ifndef __x86_64__ - *data = UNALIGNED_LOAD64(s2 + matched_bytes); + a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2; #else // Ideally this would just be // @@ -242,13 +243,12 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1, // use a conditional move (it's tuned to cut data dependencies). In this // case there is a longer parallel chain anyway AND this will be fairly // unpredictable. - uint64_t a3 = UNALIGNED_LOAD64(s2 + 4); asm("testl %k2, %k2\n\t" "cmovzq %1, %0\n\t" : "+r"(a2) : "r"(a3), "r"(xorval)); - *data = a2 >> (shift & (3 * 8)); #endif + *data = a2 >> (shift & (3 * 8)); return std::pair<size_t, bool>(matched_bytes, true); } else { matched = 8; @@ -270,16 +270,16 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1, uint64_t xorval = a1 ^ a2; int shift = Bits::FindLSBSetNonZero64(xorval); size_t matched_bytes = shift >> 3; + uint64_t a3 = UNALIGNED_LOAD64(s2 + 4); #ifndef __x86_64__ - *data = UNALIGNED_LOAD64(s2 + matched_bytes); + a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2; #else - uint64_t a3 = UNALIGNED_LOAD64(s2 + 4); asm("testl %k2, %k2\n\t" "cmovzq %1, %0\n\t" : "+r"(a2) : "r"(a3), "r"(xorval)); - *data = a2 >> (shift & (3 * 8)); #endif + *data = a2 >> (shift & (3 * 8)); matched += matched_bytes; assert(matched >= 8); return std::pair<size_t, bool>(matched, false); @@ -340,6 +340,7 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) { if (SNAPPY_PREDICT_TRUE(offset < 16)) { if (SNAPPY_PREDICT_FALSE(offset == 0)) return false; // Extend the pattern to the first 16 bytes. + // The simpler formulation of `dst[i - offset]` induces undefined behavior. for (int i = 0; i < 16; i++) dst[i] = (dst - offset)[i]; // Find a multiple of pattern >= 16. static std::array<uint8_t, 16> pattern_sizes = []() { @@ -983,22 +984,35 @@ inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) { return offset != 0; } -void MemCopy(char* dst, const uint8_t* src, size_t size) { - std::memcpy(dst, src, size); -} - -void MemCopy(ptrdiff_t dst, const uint8_t* src, size_t size) { - // TODO: Switch to [[maybe_unused]] when we can assume C++17. - (void)dst; - (void)src; - (void)size; -} - -void MemMove(char* dst, const void* src, size_t size) { - std::memmove(dst, src, size); +// Copies between size bytes and 64 bytes from src to dest. size cannot exceed +// 64. More than size bytes, but never exceeding 64, might be copied if doing +// so gives better performance. [src, src + size) must not overlap with +// [dst, dst + size), but [src, src + 64) may overlap with [dst, dst + 64). +void MemCopy64(char* dst, const void* src, size_t size) { + // Always copy this many bytes, test if we need to copy more. + constexpr int kShortMemCopy = 32; + // We're always allowed to copy 64 bytes, so if we exceed kShortMemCopy just + // copy 64 rather than the exact amount. + constexpr int kLongMemCopy = 64; + + assert(size <= kLongMemCopy); + assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size, + dst) || + std::less_equal<const void*>()(dst + size, src)); + + // We know that src and dst are at least size bytes apart. However, because we + // might copy more than size bytes the copy still might overlap past size. + // E.g. if src and dst appear consecutively in memory (src + size == dst). + std::memmove(dst, src, kShortMemCopy); + // Profiling shows that nearly all copies are short. + if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { + std::memmove(dst + kShortMemCopy, + static_cast<const uint8_t*>(src) + kShortMemCopy, + kLongMemCopy - kShortMemCopy); + } } -void MemMove(ptrdiff_t dst, const void* src, size_t size) { +void MemCopy64(ptrdiff_t dst, const void* src, size_t size) { // TODO: Switch to [[maybe_unused]] when we can assume C++17. (void)dst; (void)src; @@ -1041,7 +1055,7 @@ size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) { size_t literal_len = *tag >> 2; size_t tag_type = *tag; bool is_literal; -#if defined(__GNUC__) && defined(__x86_64__) +#if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__) // TODO clang misses the fact that the (c & 3) already correctly // sets the zero flag. asm("and $3, %k[tag_type]\n\t" @@ -1170,7 +1184,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless( // Due to the spurious offset in literals have this will trigger // at the start of a block when op is still smaller than 256. if (tag_type != 0) goto break_loop; - MemCopy(op_base + op, old_ip, 64); + MemCopy64(op_base + op, old_ip, len); op += len; continue; } @@ -1179,7 +1193,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless( // we need to copy from ip instead of from the stream. const void* from = tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip; - MemMove(op_base + op, from, 64); + MemCopy64(op_base + op, from, len); op += len; } } while (ip < ip_limit_min_slop && op < op_limit_min_slop); |