4 files changed, 72 insertions, 47 deletions
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d0ce551..66a60d5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -3,30 +3,10 @@
 We'd love to accept your patches and contributions to this project. There are
 just a few small guidelines you need to follow.
 
-## Project Goals
-
-In addition to the aims listed at the top of the [README](README.md) Snappy
-explicitly supports the following:
-
-1. C++11
-2. Clang (gcc and MSVC are best-effort).
-3. Low level optimizations (e.g. assembly or equivalent intrinsics) for:
-  1. [x86](https://en.wikipedia.org/wiki/X86)
-  2. [x86-64](https://en.wikipedia.org/wiki/X86-64)
-  3. ARMv7 (32-bit)
-  4. ARMv8 (AArch64)
-4. Supports only the Snappy compression scheme as described in
-  [format_description.txt](format_description.txt).
-5. CMake for building
-
-Changes adding features or dependencies outside of the core area of focus listed
-above might not be accepted. If in doubt post a message to the
-[Snappy discussion mailing list](https://groups.google.com/g/snappy-compression).
-
 ## Contributor License Agreement
 
 Contributions to this project must be accompanied by a Contributor License
-Agreement. You (or your employer) retain the copyright to your contribution,
+Agreement. You (or your employer) retain the copyright to your contribution;
 this simply gives us permission to use and redistribute your contributions as
 part of the project. Head over to <https://cla.developers.google.com/> to see
 your current agreements on file or to sign a new one.
@@ -35,12 +15,17 @@ You generally only need to submit a CLA once, so if you've already submitted one
 (even if it was for a different project), you probably don't need to do it
 again.
 
-## Code reviews
+## Code Reviews
 
 All submissions, including submissions by project members, require review. We
 use GitHub pull requests for this purpose. Consult
 [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
 information on using pull requests.
 
-Please make sure that all the automated checks (CLA, AppVeyor, Travis) pass for
-your pull requests. Pull requests whose checks fail may be ignored.
+See [the README](README.md#contributing-to-the-snappy-project) for areas
+where we are likely to accept external contributions.
+
+## Community Guidelines
+
+This project follows [Google's Open Source Community
+Guidelines](https://opensource.google/conduct/).
diff --git a/README.md b/README.md
index c6f7a07..398be7d 100644
--- a/README.md
+++ b/README.md
@@ -131,6 +131,32 @@ should provide a reasonably balanced starting point for benchmarking. (Note that
 baddata[1-3].snappy are not intended as benchmarks; they are used to verify
 correctness in the presence of corrupted data in the unit test.)
 
+Contributing to the Snappy Project
+==================================
+
+In addition to the aims listed at the top of the [README](README.md) Snappy
+explicitly supports the following:
+
+1. C++11
+2. Clang (gcc and MSVC are best-effort).
+3. Low level optimizations (e.g. assembly or equivalent intrinsics) for:
+  1. [x86](https://en.wikipedia.org/wiki/X86)
+  2. [x86-64](https://en.wikipedia.org/wiki/X86-64)
+  3. ARMv7 (32-bit)
+  4. ARMv8 (AArch64)
+4. Supports only the Snappy compression scheme as described in
+  [format_description.txt](format_description.txt).
+5. CMake for building
+
+Changes adding features or dependencies outside of the core area of focus listed
+above might not be accepted. If in doubt post a message to the
+[Snappy discussion mailing list](https://groups.google.com/g/snappy-compression).
+
+We are unlikely to accept contributions to the build configuration files, such
+as `CMakeLists.txt`. We are focused on maintaining a build configuration that
+allows us to test that the project works in a few supported configurations
+inside Google. We are not currently interested in supporting other requirements,
+such as different operating systems, compilers, or build systems.
 
 Contact
 =======
diff --git a/snappy-internal.h b/snappy-internal.h
index ae7ab5a..e552ea0 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -230,8 +230,9 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       uint64_t xorval = a1 ^ a2;
       int shift = Bits::FindLSBSetNonZero64(xorval);
       size_t matched_bytes = shift >> 3;
+      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
 #ifndef __x86_64__
-      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
+      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #else
       // Ideally this would just be
       //
@@ -242,13 +243,12 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       // use a conditional move (it's tuned to cut data dependencies). In this
       // case there is a longer parallel chain anyway AND this will be fairly
       // unpredictable.
-      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
       asm("testl %k2, %k2\n\t"
           "cmovzq %1, %0\n\t"
           : "+r"(a2)
           : "r"(a3), "r"(xorval));
-      *data = a2 >> (shift & (3 * 8));
 #endif
+      *data = a2 >> (shift & (3 * 8));
       return std::pair<size_t, bool>(matched_bytes, true);
     } else {
       matched = 8;
@@ -270,16 +270,16 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
       uint64_t xorval = a1 ^ a2;
       int shift = Bits::FindLSBSetNonZero64(xorval);
       size_t matched_bytes = shift >> 3;
+      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
 #ifndef __x86_64__
-      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
+      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
 #else
-      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
       asm("testl %k2, %k2\n\t"
           "cmovzq %1, %0\n\t"
           : "+r"(a2)
           : "r"(a3), "r"(xorval));
-      *data = a2 >> (shift & (3 * 8));
 #endif
+      *data = a2 >> (shift & (3 * 8));
       matched += matched_bytes;
       assert(matched >= 8);
       return std::pair<size_t, bool>(matched, false);
diff --git a/snappy.cc b/snappy.cc
index bb9e0e5..6502cfd 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -340,6 +340,7 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
   if (SNAPPY_PREDICT_TRUE(offset < 16)) {
     if (SNAPPY_PREDICT_FALSE(offset == 0)) return false;
     // Extend the pattern to the first 16 bytes.
+    // The simpler formulation of `dst[i - offset]` induces undefined behavior.
     for (int i = 0; i < 16; i++) dst[i] = (dst - offset)[i];
     // Find a multiple of pattern >= 16.
     static std::array<uint8_t, 16> pattern_sizes = []() {
@@ -983,22 +984,35 @@ inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
   return offset != 0;
 }
 
-void MemCopy(char* dst, const uint8_t* src, size_t size) {
-  std::memcpy(dst, src, size);
-}
-
-void MemCopy(ptrdiff_t dst, const uint8_t* src, size_t size) {
-  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
-  (void)dst;
-  (void)src;
-  (void)size;
-}
-
-void MemMove(char* dst, const void* src, size_t size) {
-  std::memmove(dst, src, size);
+// Copies between size bytes and 64 bytes from src to dest.  size cannot exceed
+// 64.  More than size bytes, but never exceeding 64, might be copied if doing
+// so gives better performance.  [src, src + size) must not overlap with
+// [dst, dst + size), but [src, src + 64) may overlap with [dst, dst + 64).
+void MemCopy64(char* dst, const void* src, size_t size) {
+  // Always copy this many bytes, test if we need to copy more.
+  constexpr int kShortMemCopy = 32;
+  // We're always allowed to copy 64 bytes, so if we exceed kShortMemCopy just
+  // copy 64 rather than the exact amount.
+  constexpr int kLongMemCopy = 64;
+
+  assert(size <= kLongMemCopy);
+  assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size,
+                                        dst) ||
+         std::less_equal<const void*>()(dst + size, src));
+
+  // We know that src and dst are at least size bytes apart. However, because we
+  // might copy more than size bytes the copy still might overlap past size.
+  // E.g. if src and dst appear consecutively in memory (src + size == dst).
+  std::memmove(dst, src, kShortMemCopy);
+  // Profiling shows that nearly all copies are short.
+  if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
+    std::memmove(dst + kShortMemCopy,
+                 static_cast<const uint8_t*>(src) + kShortMemCopy,
+                 kLongMemCopy - kShortMemCopy);
+  }
 }
 
-void MemMove(ptrdiff_t dst, const void* src, size_t size) {
+void MemCopy64(ptrdiff_t dst, const void* src, size_t size) {
   // TODO: Switch to [[maybe_unused]] when we can assume C++17.
   (void)dst;
   (void)src;
@@ -1041,7 +1055,7 @@ size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) {
   size_t literal_len = *tag >> 2;
   size_t tag_type = *tag;
   bool is_literal;
-#if defined(__GNUC__) && defined(__x86_64__)
+#if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__)
   // TODO clang misses the fact that the (c & 3) already correctly
   // sets the zero flag.
   asm("and $3, %k[tag_type]\n\t"
@@ -1170,7 +1184,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
           // Due to the spurious offset in literals have this will trigger
           // at the start of a block when op is still smaller than 256.
           if (tag_type != 0) goto break_loop;
-          MemCopy(op_base + op, old_ip, 64);
+          MemCopy64(op_base + op, old_ip, len);
           op += len;
           continue;
         }
@@ -1179,7 +1193,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
         // we need to copy from ip instead of from the stream.
         const void* from =
             tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;
-        MemMove(op_base + op, from, 64);
+        MemCopy64(op_base + op, from, len);
         op += len;
       }
     } while (ip < ip_limit_min_slop && op < op_limit_min_slop);