summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Green <david.green@arm.com>2022-01-17 11:41:53 +0000
committerDavid Green <david.green@arm.com>2022-01-19 07:14:46 +0000
commit6c6e890ef961260b60f9f9a2051ac4618dbd06bd (patch)
tree3a0f1caa72ba90717ec95e197a3e6fb8f1cdad29
parent8b07ff196aa007e9b073aa4f062dbb23443594ec (diff)
downloadsnappy-git-6c6e890ef961260b60f9f9a2051ac4618dbd06bd.tar.gz
Change LittleEndian loads/stores to use memcpy
The existing code uses a series of 8-bit loads with shifts and ors to emulate an (unaligned) load of a larger type. These are then expected to become single loads in the compiler, producing optimal assembly. Whilst this is true, it happens very late in the compiler, meaning that throughout most of the pipeline it is treated (and cost-modelled) as multiple loads, shifts and ors. This can lead the compiler to make poor decisions (such as not unrolling loops that should be unrolled), or to break up the pattern before it is turned into a single load. For example, the loops in CompressFragment do not get unrolled as expected due to a higher cost than the unroll threshold in clang. Instead, this patch uses a more conventional method of loading unaligned data, using a memcpy directly, which the compiler will be able to deal with much more straightforwardly, modelling it as a single unaligned load. The old code is left as-is for big-endian systems. This helps improve the performance of the BM_ZFlat benchmarks by up to 10-15% on an Arm Neoverse N1. Change-Id: I986f845ebd0a0806d052d2be3e4dbcbee91713d7
-rw-r--r--snappy-stubs-internal.h48
1 files changed, 36 insertions, 12 deletions
diff --git a/snappy-stubs-internal.h b/snappy-stubs-internal.h
index 7d43c92..8e28b3d 100644
--- a/snappy-stubs-internal.h
+++ b/snappy-stubs-internal.h
@@ -171,27 +171,37 @@ class LittleEndian {
public:
// Functions to do unaligned loads and stores in little-endian order.
static inline uint16_t Load16(const void *ptr) {
- const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
-
// Compiles to a single mov/str on recent clang and gcc.
+#if SNAPPY_IS_BIG_ENDIAN
+ const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
return (static_cast<uint16_t>(buffer[0])) |
(static_cast<uint16_t>(buffer[1]) << 8);
+#else
+ uint16_t x;
+ memcpy(&x, ptr, 2);
+ return x;
+#endif
}
static inline uint32_t Load32(const void *ptr) {
- const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
-
// Compiles to a single mov/str on recent clang and gcc.
+#if SNAPPY_IS_BIG_ENDIAN
+ const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
return (static_cast<uint32_t>(buffer[0])) |
(static_cast<uint32_t>(buffer[1]) << 8) |
(static_cast<uint32_t>(buffer[2]) << 16) |
(static_cast<uint32_t>(buffer[3]) << 24);
+#else
+ uint32_t x;
+ memcpy(&x, ptr, 4);
+ return x;
+#endif
}
static inline uint64_t Load64(const void *ptr) {
- const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
-
// Compiles to a single mov/str on recent clang and gcc.
+#if SNAPPY_IS_BIG_ENDIAN
+ const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
return (static_cast<uint64_t>(buffer[0])) |
(static_cast<uint64_t>(buffer[1]) << 8) |
(static_cast<uint64_t>(buffer[2]) << 16) |
@@ -200,30 +210,41 @@ class LittleEndian {
(static_cast<uint64_t>(buffer[5]) << 40) |
(static_cast<uint64_t>(buffer[6]) << 48) |
(static_cast<uint64_t>(buffer[7]) << 56);
+#else
+ uint64_t x;
+ memcpy(&x, ptr, 8);
+ return x;
+#endif
}
static inline void Store16(void *dst, uint16_t value) {
- uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
-
// Compiles to a single mov/str on recent clang and gcc.
+#if SNAPPY_IS_BIG_ENDIAN
+ uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
buffer[0] = static_cast<uint8_t>(value);
buffer[1] = static_cast<uint8_t>(value >> 8);
+#else
+ memcpy(dst, &value, 2);
+#endif
}
static void Store32(void *dst, uint32_t value) {
- uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
-
// Compiles to a single mov/str on recent clang and gcc.
+#if SNAPPY_IS_BIG_ENDIAN
+ uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
buffer[0] = static_cast<uint8_t>(value);
buffer[1] = static_cast<uint8_t>(value >> 8);
buffer[2] = static_cast<uint8_t>(value >> 16);
buffer[3] = static_cast<uint8_t>(value >> 24);
+#else
+ memcpy(dst, &value, 4);
+#endif
}
static void Store64(void* dst, uint64_t value) {
- uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
-
// Compiles to a single mov/str on recent clang and gcc.
+#if SNAPPY_IS_BIG_ENDIAN
+ uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
buffer[0] = static_cast<uint8_t>(value);
buffer[1] = static_cast<uint8_t>(value >> 8);
buffer[2] = static_cast<uint8_t>(value >> 16);
@@ -232,6 +253,9 @@ class LittleEndian {
buffer[5] = static_cast<uint8_t>(value >> 40);
buffer[6] = static_cast<uint8_t>(value >> 48);
buffer[7] = static_cast<uint8_t>(value >> 56);
+#else
+ memcpy(dst, &value, 8);
+#endif
}
static inline constexpr bool IsLittleEndian() {