author     Guillaume Chatelet <gchatelet@google.com>  2023-05-10 08:34:05 +0000
committer  Guillaume Chatelet <gchatelet@google.com>  2023-05-10 08:42:07 +0000
commit     f4a35492504d7a47afc8ea5b5dd9c437b7b66380 (patch)
tree       721211bcf0af2e8540c249e4f82177a84439e4f9 /libc
parent     f109b1016801e2b0dbee278f3c517057c0b1d441 (diff)
[libc] Add optimized memcpy for RISCV
This patch adds two versions of memcpy optimized for architectures where unaligned accesses are either illegal or extremely slow. It is currently enabled for RISC-V 64 and RISC-V 32, but it could be used for 32-bit ARM architectures as well.

Here is the before/after output of `libc.benchmarks.memory_functions.opt_host --benchmark_filter=BM_Memcpy` on a quad-core StarFive RISC-V 64 board running Linux at 1.5 GHz.

Before:
```
Run on (4 X 1500 MHz CPU s)
CPU Caches:
  L1 Instruction 32 KiB (x4)
  L1 Data 32 KiB (x4)
  L2 Unified 2048 KiB (x1)
------------------------------------------------------------------------
Benchmark          Time       CPU  Iterations  UserCounters...
------------------------------------------------------------------------
BM_Memcpy/0/0    474 ns    474 ns     1483776  bytes_per_cycle=0.243492/s bytes_per_second=348.318M/s items_per_second=2.11097M/s __llvm_libc::memcpy,memcpy Google A
BM_Memcpy/1/0    210 ns    209 ns     3649536  bytes_per_cycle=0.233819/s bytes_per_second=334.481M/s items_per_second=4.77519M/s __llvm_libc::memcpy,memcpy Google B
BM_Memcpy/2/0   1814 ns   1814 ns      396288  bytes_per_cycle=0.247899/s bytes_per_second=354.622M/s items_per_second=551.402k/s __llvm_libc::memcpy,memcpy Google D
BM_Memcpy/3/0   89.3 ns   89.2 ns     7459840  bytes_per_cycle=0.217415/s bytes_per_second=311.014M/s items_per_second=11.2071M/s __llvm_libc::memcpy,memcpy Google L
BM_Memcpy/4/0    134 ns    134 ns     3815424  bytes_per_cycle=0.226584/s bytes_per_second=324.131M/s items_per_second=7.44567M/s __llvm_libc::memcpy,memcpy Google M
BM_Memcpy/5/0   52.8 ns   52.6 ns    11001856  bytes_per_cycle=0.194893/s bytes_per_second=278.797M/s items_per_second=19.0284M/s __llvm_libc::memcpy,memcpy Google Q
BM_Memcpy/6/0    180 ns    180 ns     4101120  bytes_per_cycle=0.231884/s bytes_per_second=331.713M/s items_per_second=5.55957M/s __llvm_libc::memcpy,memcpy Google S
BM_Memcpy/7/0    195 ns    195 ns     3906560  bytes_per_cycle=0.232951/s bytes_per_second=333.239M/s items_per_second=5.1217M/s __llvm_libc::memcpy,memcpy Google U
BM_Memcpy/8/0    152 ns    152 ns     4789248  bytes_per_cycle=0.227507/s bytes_per_second=325.452M/s items_per_second=6.58187M/s __llvm_libc::memcpy,memcpy Google W
BM_Memcpy/9/0   6036 ns   6033 ns      118784  bytes_per_cycle=0.249158/s bytes_per_second=356.423M/s items_per_second=165.75k/s __llvm_libc::memcpy,uniform 384 to 4096
```

After:
```
BM_Memcpy/0/0    126 ns    126 ns     5770240  bytes_per_cycle=1.04707/s bytes_per_second=1.46273G/s items_per_second=7.9385M/s __llvm_libc::memcpy,memcpy Google A
BM_Memcpy/1/0   75.1 ns   75.0 ns    10204160  bytes_per_cycle=0.691143/s bytes_per_second=988.687M/s items_per_second=13.3289M/s __llvm_libc::memcpy,memcpy Google B
BM_Memcpy/2/0    333 ns    333 ns     2174976  bytes_per_cycle=1.39297/s bytes_per_second=1.94596G/s items_per_second=3.00002M/s __llvm_libc::memcpy,memcpy Google D
BM_Memcpy/3/0   49.6 ns   49.5 ns    16092160  bytes_per_cycle=0.710161/s bytes_per_second=1015.89M/s items_per_second=20.1844M/s __llvm_libc::memcpy,memcpy Google L
BM_Memcpy/4/0   57.7 ns   57.7 ns    11213824  bytes_per_cycle=0.561557/s bytes_per_second=803.314M/s items_per_second=17.3228M/s __llvm_libc::memcpy,memcpy Google M
BM_Memcpy/5/0   48.0 ns   47.9 ns    16437248  bytes_per_cycle=0.346708/s bytes_per_second=495.97M/s items_per_second=20.8571M/s __llvm_libc::memcpy,memcpy Google Q
BM_Memcpy/6/0   67.5 ns   67.5 ns    10616832  bytes_per_cycle=0.614173/s bytes_per_second=878.582M/s items_per_second=14.8142M/s __llvm_libc::memcpy,memcpy Google S
BM_Memcpy/7/0   84.7 ns   84.6 ns    10480640  bytes_per_cycle=0.819077/s bytes_per_second=1.14424G/s items_per_second=11.8174M/s __llvm_libc::memcpy,memcpy Google U
BM_Memcpy/8/0   61.7 ns   61.6 ns    11191296  bytes_per_cycle=0.550078/s bytes_per_second=786.893M/s items_per_second=16.2279M/s __llvm_libc::memcpy,memcpy Google W
BM_Memcpy/9/0    981 ns    981 ns      703488  bytes_per_cycle=1.52333/s bytes_per_second=2.12807G/s items_per_second=1019.81k/s __llvm_libc::memcpy,uniform 384 to 4096
```

It is not as good as glibc for now, so there's room for improvement. I suspect a path pumping 16 bytes at once would help, given the doubled numbers for large copies.
```
BM_Memcpy/0/1    146 ns   82.5 ns     8576000  bytes_per_cycle=1.35236/s bytes_per_second=1.88922G/s items_per_second=12.1169M/s glibc memcpy,memcpy Google A
BM_Memcpy/1/1    112 ns   63.7 ns    10634240  bytes_per_cycle=0.628018/s bytes_per_second=898.387M/s items_per_second=15.702M/s glibc memcpy,memcpy Google B
BM_Memcpy/2/1    315 ns    180 ns     4079616  bytes_per_cycle=2.65229/s bytes_per_second=3.7052G/s items_per_second=5.54764M/s glibc memcpy,memcpy Google D
BM_Memcpy/3/1   85.3 ns   43.1 ns    15854592  bytes_per_cycle=0.774164/s bytes_per_second=1107.45M/s items_per_second=23.2249M/s glibc memcpy,memcpy Google L
BM_Memcpy/4/1    105 ns   54.3 ns    13427712  bytes_per_cycle=0.7793/s bytes_per_second=1114.8M/s items_per_second=18.4109M/s glibc memcpy,memcpy Google M
BM_Memcpy/5/1   77.1 ns   43.2 ns    16476160  bytes_per_cycle=0.279808/s bytes_per_second=400.269M/s items_per_second=23.1428M/s glibc memcpy,memcpy Google Q
BM_Memcpy/6/1    112 ns   62.7 ns    11236352  bytes_per_cycle=0.676078/s bytes_per_second=967.137M/s items_per_second=15.9387M/s glibc memcpy,memcpy Google S
BM_Memcpy/7/1    131 ns   65.5 ns    11751424  bytes_per_cycle=0.965616/s bytes_per_second=1.34895G/s items_per_second=15.2762M/s glibc memcpy,memcpy Google U
BM_Memcpy/8/1    104 ns   55.0 ns    12314624  bytes_per_cycle=0.583336/s bytes_per_second=834.468M/s items_per_second=18.1937M/s glibc memcpy,memcpy Google W
BM_Memcpy/9/1    932 ns    466 ns     1480704  bytes_per_cycle=3.17342/s bytes_per_second=4.43321G/s items_per_second=2.14679M/s glibc memcpy,uniform 384 to 4096
```

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D150202
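The core idea, visible throughout the diff below, is to keep every hardware access aligned and to recombine partial loads with shifts and ORs. Here is a minimal standalone sketch of that recombination for a 4-byte-aligned source (little-endian assumed; the function name is illustrative and not from this patch):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Sketch only: merge two aligned 32-bit loads into one 64-bit value, the way
// the patch's load64_aligned<uint32_t, uint32_t> does on little-endian.
// std::memcpy stands in for the patch's assume_aligned + load<T> helpers.
uint64_t load64_from_4byte_aligned(const char *src) {
  uint32_t lo, hi;
  std::memcpy(&lo, src, sizeof(lo));     // aligned 32-bit load of bytes 0..3
  std::memcpy(&hi, src + 4, sizeof(hi)); // aligned 32-bit load of bytes 4..7
  return uint64_t{lo} | (uint64_t{hi} << 32);
}

int main() {
  alignas(8) char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  std::printf("%016llx\n", static_cast<unsigned long long>(
                               load64_from_4byte_aligned(buf)));
}
```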
Diffstat (limited to 'libc')
 -rw-r--r--  libc/src/__support/macros/properties/architectures.h  |  4
 -rw-r--r--  libc/src/string/memory_utils/CMakeLists.txt           |  1
 -rw-r--r--  libc/src/string/memory_utils/memcpy_implementations.h | 67
 -rw-r--r--  libc/src/string/memory_utils/utils.h                  | 87
 -rw-r--r--  libc/test/src/string/memory_utils/utils_test.cpp      | 40
 5 files changed, 191 insertions(+), 8 deletions(-)
diff --git a/libc/src/__support/macros/properties/architectures.h b/libc/src/__support/macros/properties/architectures.h
index 1247fd6ef5cb..66bb6fb68a64 100644
--- a/libc/src/__support/macros/properties/architectures.h
+++ b/libc/src/__support/macros/properties/architectures.h
@@ -49,6 +49,10 @@
#define LIBC_TARGET_ARCH_IS_RISCV64
#endif
+#if defined(__riscv) && (__riscv_xlen == 32)
+#define LIBC_TARGET_ARCH_IS_RISCV32
+#endif
+
#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM))
#define LIBC_TARGET_ARCH_IS_ANY_ARM
#endif
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 31335227f4ab..7bb0e960ee13 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -18,6 +18,7 @@ add_header_library(
x86_64/memcmp_implementations.h
x86_64/memcpy_implementations.h
DEPS
+ libc.src.__support.common
libc.src.__support.CPP.bit
libc.src.__support.CPP.cstddef
libc.src.__support.CPP.type_traits
diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
index b1b60ff590d5..b22606781703 100644
--- a/libc/src/string/memory_utils/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -26,24 +26,79 @@
namespace __llvm_libc {
[[maybe_unused]] LIBC_INLINE void
-inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src,
- size_t count) {
+inline_memcpy_byte_per_byte(Ptr dst, CPtr src, size_t offset, size_t count) {
LIBC_LOOP_NOUNROLL
- for (size_t offset = 0; offset < count; ++offset)
- builtin::Memcpy<1>::block(dst + offset, src + offset);
+ for (; offset < count; ++offset)
+ dst[offset] = src[offset];
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_aligned_access_32bit(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
+ constexpr size_t kAlign = sizeof(uint32_t);
+ if (count <= 2 * kAlign)
+ return inline_memcpy_byte_per_byte(dst, src, 0, count);
+ size_t bytes_to_dst_align = distance_to_align_up<kAlign>(dst);
+ inline_memcpy_byte_per_byte(dst, src, 0, bytes_to_dst_align);
+ size_t offset = bytes_to_dst_align;
+ size_t src_alignment = distance_to_align_down<kAlign>(src + offset);
+ for (; offset < count - kAlign; offset += kAlign) {
+ uint32_t value;
+ if (src_alignment == 0)
+ value = load32_aligned<uint32_t>(src, offset);
+ else if (src_alignment == 2)
+ value = load32_aligned<uint16_t, uint16_t>(src, offset);
+ else
+ value = load32_aligned<uint8_t, uint16_t, uint8_t>(src, offset);
+ store32_aligned<uint32_t>(value, dst, offset);
+ }
+ // remainder
+ inline_memcpy_byte_per_byte(dst, src, offset, count);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_aligned_access_64bit(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
+ constexpr size_t kAlign = sizeof(uint64_t);
+ if (count <= 2 * kAlign)
+ return inline_memcpy_byte_per_byte(dst, src, 0, count);
+ size_t bytes_to_dst_align = distance_to_align_up<kAlign>(dst);
+ inline_memcpy_byte_per_byte(dst, src, 0, bytes_to_dst_align);
+ size_t offset = bytes_to_dst_align;
+ size_t src_alignment = distance_to_align_down<kAlign>(src + offset);
+ for (; offset < count - kAlign; offset += kAlign) {
+ uint64_t value;
+ if (src_alignment == 0)
+ value = load64_aligned<uint64_t>(src, offset);
+ else if (src_alignment == 4)
+ value = load64_aligned<uint32_t, uint32_t>(src, offset);
+ else if (src_alignment == 2)
+ value =
+ load64_aligned<uint16_t, uint16_t, uint16_t, uint16_t>(src, offset);
+ else
+ value = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
+ src, offset);
+ store64_aligned<uint64_t>(value, dst, offset);
+ }
+ // remainder
+ inline_memcpy_byte_per_byte(dst, src, offset, count);
}
LIBC_INLINE void inline_memcpy(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
using namespace __llvm_libc::builtin;
#if defined(LIBC_COPT_MEMCPY_USE_EMBEDDED_TINY)
- return inline_memcpy_embedded_tiny(dst, src, count);
+ return inline_memcpy_byte_per_byte(dst, src, 0, count);
#elif defined(LIBC_TARGET_ARCH_IS_X86)
return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
return inline_memcpy_aarch64(dst, src, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV64)
+ return inline_memcpy_aligned_access_64bit(dst, src, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
+ return inline_memcpy_aligned_access_32bit(dst, src, count);
#else
- return inline_memcpy_embedded_tiny(dst, src, count);
+ return inline_memcpy_byte_per_byte(dst, src, 0, count);
#endif
}
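The aligned-access variants above share a head/body/tail shape: byte-copy until `dst` reaches word alignment, stream word-sized aligned stores, then byte-copy the remainder. A condensed, self-contained sketch of the 64-bit shape follows (the function name is illustrative, and std::memcpy stands in for the per-source-alignment load64_aligned dispatch the real code performs):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Head/body/tail sketch of inline_memcpy_aligned_access_64bit (simplified:
// the real code selects a load decomposition from the source alignment).
void memcpy_aligned_sketch(char *dst, const char *src, size_t count) {
  constexpr size_t kAlign = sizeof(uint64_t);
  if (count <= 2 * kAlign) { // too small to amortize the alignment work
    for (size_t i = 0; i < count; ++i)
      dst[i] = src[i];
    return;
  }
  // Head: byte-copy until dst is 8-byte aligned.
  const uintptr_t addr = reinterpret_cast<uintptr_t>(dst);
  size_t offset = (kAlign - addr % kAlign) % kAlign; // distance_to_align_up
  for (size_t i = 0; i < offset; ++i)
    dst[i] = src[i];
  // Body: 8-byte aligned stores; loads are decomposed in the real code.
  for (; offset < count - kAlign; offset += kAlign) {
    uint64_t value;
    std::memcpy(&value, src + offset, kAlign);
    std::memcpy(dst + offset, &value, kAlign);
  }
  // Tail: byte-copy the remainder.
  for (; offset < count; ++offset)
    dst[offset] = src[offset];
}

int main() {
  char src[32], dst[32] = {};
  for (int i = 0; i < 32; ++i)
    src[i] = static_cast<char>(i);
  memcpy_aligned_sketch(dst + 1, src + 3, 29); // misaligned on both sides
  return std::memcmp(dst + 1, src + 3, 29);    // 0 on success
}
```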
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 5c7b360ad108..ab33331847af 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -12,8 +12,9 @@
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/cstddef.h"
#include "src/__support/CPP/type_traits.h"
-#include "src/__support/macros/attributes.h" //LIBC_INLINE
-#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN
+#include "src/__support/endian.h"
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN
#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
@@ -97,8 +98,15 @@ LIBC_INLINE void memcpy_inline(void *__restrict dst,
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
__builtin_memcpy_inline(dst, src, Size);
#else
+// In memory functions `memcpy_inline` is instantiated several times with
+// different values of the Size parameter. This doesn't play well with GCC's
+// Value Range Analysis, which wrongly detects out-of-bounds accesses. We
+// disable the 'array-bounds' warning for the purpose of this function.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
for (size_t i = 0; i < Size; ++i)
static_cast<char *>(dst)[i] = static_cast<const char *>(src)[i];
+#pragma GCC diagnostic pop
#endif
}
@@ -153,6 +161,81 @@ template <typename T> LIBC_INLINE void store(Ptr ptr, T value) {
memcpy_inline<sizeof(T)>(ptr, &value);
}
+// On architectures that do not allow for unaligned access we perform several
+// aligned accesses and recombine them through shifts and logical operations.
+// For instance, if we know that the pointer is 2-byte aligned we can decompose
+// a 64-bit operation into four 16-bit operations.
+
+// Loads a 'ValueType' by decomposing it into several loads that are assumed to
+// be aligned.
+// e.g. load_aligned<uint32_t, uint16_t, uint16_t>(ptr);
+template <typename ValueType, typename T, typename... TS>
+ValueType load_aligned(CPtr src) {
+ static_assert(sizeof(ValueType) >= (sizeof(T) + ... + sizeof(TS)));
+ const ValueType value = load<T>(assume_aligned<sizeof(T)>(src));
+ if constexpr (sizeof...(TS) > 0) {
+ constexpr size_t shift = sizeof(T) * 8;
+ const ValueType next = load_aligned<ValueType, TS...>(src + sizeof(T));
+ if constexpr (Endian::IS_LITTLE)
+ return value | (next << shift);
+ else if constexpr (Endian::IS_BIG)
+ return (value << shift) | next;
+ else
+ deferred_static_assert("Invalid endianness");
+ } else {
+ return value;
+ }
+}
+
+// Alias for loading a 'uint32_t'.
+template <typename T, typename... TS>
+auto load32_aligned(CPtr src, size_t offset) {
+ static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint32_t));
+ return load_aligned<uint32_t, T, TS...>(src + offset);
+}
+
+// Alias for loading a 'uint64_t'.
+template <typename T, typename... TS>
+auto load64_aligned(CPtr src, size_t offset) {
+ static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint64_t));
+ return load_aligned<uint64_t, T, TS...>(src + offset);
+}
+
+// Stores a 'ValueType' by decomposing it into several stores that are assumed
+// to be aligned.
+// e.g. store_aligned<uint32_t, uint16_t, uint16_t>(value, ptr);
+template <typename ValueType, typename T, typename... TS>
+void store_aligned(ValueType value, Ptr dst) {
+ static_assert(sizeof(ValueType) >= (sizeof(T) + ... + sizeof(TS)));
+ constexpr size_t shift = sizeof(T) * 8;
+ if constexpr (Endian::IS_LITTLE) {
+ store<T>(assume_aligned<sizeof(T)>(dst), value & ~T(0));
+ if constexpr (sizeof...(TS) > 0)
+ store_aligned<ValueType, TS...>(value >> shift, dst + sizeof(T));
+ } else if constexpr (Endian::IS_BIG) {
+ constexpr size_t OFFSET = (0 + ... + sizeof(TS));
+ store<T>(assume_aligned<sizeof(T)>(dst + OFFSET), value & ~T(0));
+ if constexpr (sizeof...(TS) > 0)
+ store_aligned<ValueType, TS...>(value >> shift, dst);
+ } else {
+ deferred_static_assert("Invalid endianness");
+ }
+}
+
+// Alias for storing a 'uint32_t'.
+template <typename T, typename... TS>
+void store32_aligned(uint32_t value, Ptr dst, size_t offset) {
+ static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint32_t));
+ store_aligned<uint32_t, T, TS...>(value, dst + offset);
+}
+
+// Alias for storing a 'uint64_t'.
+template <typename T, typename... TS>
+void store64_aligned(uint64_t value, Ptr dst, size_t offset) {
+ static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint64_t));
+ store_aligned<uint64_t, T, TS...>(value, dst + offset);
+}
+
// Advances the pointers p1 and p2 by offset bytes and decreases count by the
// same amount.
template <typename T1, typename T2>
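To trace the recursion concretely: when `src` is odd, `src + 1` is 2-byte aligned, and the 32-bit memcpy body calls `load32_aligned<uint8_t, uint16_t, uint8_t>`, which issues three aligned loads and merges them. A hand-expanded little-endian sketch (std::memcpy stands in for the aligned `load<T>`; the function name is illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Hand expansion of load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src)
// for an odd src: each piece is shifted past the bytes already consumed.
uint32_t load32_odd_sketch(const unsigned char *src) {
  uint8_t b0 = src[0]; // 1 byte at offset 0
  uint16_t h1;
  std::memcpy(&h1, src + 1, sizeof(h1)); // 2 bytes at offset 1 (2-byte aligned)
  uint8_t b3 = src[3]; // 1 byte at offset 3
  return uint32_t{b0} | (uint32_t{h1} << 8) | (uint32_t{b3} << 24);
}

int main() {
  alignas(4) unsigned char buf[8] = {0, 0x0D, 0xF0, 0xEF, 0xBE, 0, 0, 0};
  std::printf("%08x\n",
              static_cast<unsigned>(load32_odd_sketch(buf + 1))); // beeff00d
}
```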
diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
index 3f8ce5da72aa..37d61d84c53b 100644
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -144,4 +144,44 @@ TEST(LlvmLibcUtilsTest, Align2) {
}
}
+TEST(LlvmLibcUtilsTest, LoadStoreAligned) {
+ const uint64_t init = 0xDEAD'C0DE'BEEF'F00D;
+ CPtr const src = reinterpret_cast<CPtr>(&init);
+ uint64_t store;
+ Ptr const dst = reinterpret_cast<Ptr>(&store);
+
+ using LoadFun = uint64_t (*)(CPtr);
+ using StoreFun = void (*)(uint64_t, Ptr);
+
+ {
+ LoadFun ld = load_aligned<uint64_t, uint64_t>;
+ StoreFun st = store_aligned<uint64_t, uint64_t>;
+ const uint64_t loaded = ld(src);
+ EXPECT_EQ(init, loaded);
+ store = 0;
+ st(init, dst);
+ EXPECT_EQ(init, store);
+ }
+
+ {
+ LoadFun ld = load_aligned<uint64_t, uint32_t, uint32_t>;
+ StoreFun st = store_aligned<uint64_t, uint32_t, uint32_t>;
+ const uint64_t loaded = ld(src);
+ EXPECT_EQ(init, loaded);
+ store = 0;
+ st(init, dst);
+ EXPECT_EQ(init, store);
+ }
+
+ {
+ LoadFun ld = load_aligned<uint64_t, uint32_t, uint16_t, uint8_t, uint8_t>;
+ StoreFun st = store_aligned<uint64_t, uint32_t, uint16_t, uint8_t, uint8_t>;
+ const uint64_t loaded = ld(src);
+ EXPECT_EQ(init, loaded);
+ store = 0;
+ st(init, dst);
+ EXPECT_EQ(init, store);
+ }
+}
+
} // namespace __llvm_libc