diff options
-rw-r--r-- | snappy-stubs-internal.h | 57 | ||||
-rw-r--r-- | snappy.cc | 16 |
2 files changed, 65 insertions, 8 deletions
diff --git a/snappy-stubs-internal.h b/snappy-stubs-internal.h index 12ba1ab..6033cdf 100644 --- a/snappy-stubs-internal.h +++ b/snappy-stubs-internal.h @@ -178,6 +178,8 @@ class LogMessageVoidify { // Potentially unaligned loads and stores. +// x86 and PowerPC can simply do these loads and stores native. + #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p)) @@ -188,6 +190,47 @@ class LogMessageVoidify { #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val)) #define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val)) +// ARMv7 and newer support native unaligned accesses, but only of 16-bit +// and 32-bit values (not 64-bit); older versions either raise a fatal signal, +// do an unaligned read and rotate the words around a bit, or do the reads very +// slowly (trip through kernel mode). There's no simple #define that says just +// “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6 +// sub-architectures. +// +// This is a mess, but there's not much we can do about it. + +#elif defined(__arm__) && \ + !defined(__ARM_ARCH_5__) && \ + !defined(__ARM_ARCH_5T__) && \ + !defined(__ARM_ARCH_5TE__) && \ + !defined(__ARM_ARCH_5TEJ__) && \ + !defined(__ARM_ARCH_6__) && \ + !defined(__ARM_ARCH_6J__) && \ + !defined(__ARM_ARCH_6K__) && \ + !defined(__ARM_ARCH_6Z__) && \ + !defined(__ARM_ARCH_6ZK__) && \ + !defined(__ARM_ARCH_6T2__) + +#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p)) +#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p)) + +#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val)) +#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val)) + +// TODO(user): NEON supports unaligned 64-bit loads and stores. +// See if that would be more efficient on platforms supporting it, +// at least for copies. + +inline uint64 UNALIGNED_LOAD64(const void *p) { + uint64 t; + memcpy(&t, p, sizeof t); + return t; +} + +inline void UNALIGNED_STORE64(void *p, uint64 v) { + memcpy(p, &v, sizeof v); +} + #else // These functions are provided for architectures that don't support @@ -225,6 +268,20 @@ inline void UNALIGNED_STORE64(void *p, uint64 v) { #endif +// This can be more efficient than UNALIGNED_LOAD64 + UNALIGNED_STORE64 +// on some platforms, in particular ARM. +inline void UnalignedCopy64(const void *src, void *dst) { + if (sizeof(void *) == 8) { + UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src)); + } else { + const char *src_char = reinterpret_cast<const char *>(src); + char *dst_char = reinterpret_cast<char *>(dst); + + UNALIGNED_STORE32(dst_char, UNALIGNED_LOAD32(src_char)); + UNALIGNED_STORE32(dst_char + 4, UNALIGNED_LOAD32(src_char + 4)); + } +} + // The following guarantees declaration of the byte swap functions. #ifdef WORDS_BIGENDIAN @@ -140,12 +140,12 @@ const int kMaxIncrementCopyOverflow = 10; static inline void IncrementalCopyFastPath(const char* src, char* op, int len) { while (op - src < 8) { - UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src)); + UnalignedCopy64(src, op); len -= op - src; op += op - src; } while (len > 0) { - UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src)); + UnalignedCopy64(src, op); src += 8; op += 8; len -= 8; @@ -172,8 +172,8 @@ static inline char* EmitLiteral(char* op, // - The output will always have 32 spare bytes (see // MaxCompressedLength). if (allow_fast_path && len <= 16) { - UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal)); - UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(literal + 8)); + UnalignedCopy64(literal, op); + UnalignedCopy64(literal + 8, op + 8); return op + len; } } else { @@ -955,8 +955,8 @@ class SnappyArrayWriter { const size_t space_left = op_limit_ - op; if (len <= 16 && available >= 16 && space_left >= 16) { // Fast path, used for the majority (about 95%) of invocations. - UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip)); - UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8)); + UnalignedCopy64(ip, op); + UnalignedCopy64(ip + 8, op + 8); op_ = op + len; return true; } else { @@ -973,8 +973,8 @@ class SnappyArrayWriter { } if (len <= 16 && offset >= 8 && space_left >= 16) { // Fast path, used for the majority (70-80%) of dynamic invocations. - UNALIGNED_STORE64(op, UNALIGNED_LOAD64(op - offset)); - UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(op - offset + 8)); + UnalignedCopy64(op - offset, op); + UnalignedCopy64(op - offset + 8, op + 8); } else { if (space_left >= len + kMaxIncrementCopyOverflow) { IncrementalCopyFastPath(op - offset, op, len); |