author    Tue Ly <lntue@google.com>    2023-05-12 15:53:59 -0400
committer Tue Ly <lntue@google.com>    2023-05-12 21:01:15 -0400
commit    36b702901a0bff76c1f22926b7f4744f6b760659 (patch)
tree      ad4d3c1cb33a4a90370fe35af4caa7776bb1ee35 /libc
parent    4cc2010f0368b71316acd5895f9a016e7f34c5fb (diff)
download  llvm-36b702901a0bff76c1f22926b7f4744f6b760659.tar.gz
[libc][math] Implement fast division / modulus for UInt / (uint32_t * 2^e).
This is to improve a performance bottleneck of printf for long double.

Reviewed By: michaelrj

Differential Revision: https://reviews.llvm.org/D150475
Diffstat (limited to 'libc')
-rw-r--r--  libc/src/__support/UInt.h              | 120
-rw-r--r--  libc/test/src/__support/uint_test.cpp  |  27
2 files changed, 147 insertions(+), 0 deletions(-)
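Before the diff, it may help to see the core identity on plain 64-bit integers. The sketch below is illustrative only (the function name and the DivResult struct are not from the patch): dividing y by x * 2^e needs just one division by x, because the low e bits of y pass straight through to the remainder.

#include <cassert>
#include <cstdint>

// Toy scalar analog of the patch's idea (illustrative, not the patch's API):
// divide a 64-bit y by (x * 2^e) with a single division by the 32-bit x.
// The low e bits of y belong entirely to the remainder, since
// r % 2^e = y % 2^e.
struct DivResult {
  uint64_t quotient;
  uint64_t remainder;
};

DivResult div_x_times_pow2(uint64_t y, uint32_t x, unsigned e) {
  assert(x != 0 && e < 64); // also assumes x * 2^e fits in 64 bits
  uint64_t low = (e == 0) ? 0 : (y & (~uint64_t(0) >> (64 - e)));
  uint64_t hi = y >> e;               // the bits of y that are >= 2^e
  uint64_t q = hi / x;                // the only real division needed
  uint64_t r = ((hi % x) << e) | low; // reattach the low e bits
  return {q, r};                      // y == q * (x * 2^e) + r
}

The patch applies the same split to a multi-word UInt, with the extra wrinkle that e need not be a multiple of the word size.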
diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h
index 8b273e368964..a702aaad827b 100644
--- a/libc/src/__support/UInt.h
+++ b/libc/src/__support/UInt.h
@@ -342,6 +342,126 @@ template <size_t Bits> struct UInt {
return remainder;
}
+ // Efficiently perform UInt / (x * 2^e), where x is a 32-bit unsigned integer,
+ // and return the remainder.
+ // The main idea is as follows:
+ // Let q = y / (x * 2^e) be the quotient, and
+ // r = y % (x * 2^e) be the remainder.
+ // First, notice that:
+ // r % (2^e) = y % (2^e),
+ // so we just need to focus on the bits of y that are >= 2^e.
+ // To speed up the shift-and-add steps, we only use x as the divisor, and
+ // perform 32-bit shifts instead of bit-by-bit shifts.
+ // Since the remainder of each division step is < x < 2^32, the computation
+ // of each step is properly contained within uint64_t.
+ // Finally, we perform some extra alignment steps for the remaining bits.
+ constexpr optional<UInt<Bits>> div_uint32_times_pow_2(uint32_t x, size_t e) {
+ UInt<Bits> remainder(0);
+
+ if (x == 0) {
+ return nullopt;
+ }
+ if (e >= Bits) {
+ remainder = *this;
+ *this = UInt<Bits>(0);
+ return remainder;
+ }
+
+ UInt<Bits> quotient(0);
+ uint64_t x64 = static_cast<uint64_t>(x);
+ // lower64 = smallest multiple of 64 that is >= e.
+ size_t lower64 = ((e >> 6) + ((e & 63) != 0)) << 6;
+ // lower_pos is the index of the closest 64-bit chunk >= 2^e.
+ size_t lower_pos = lower64 / 64;
+ // Keep track of the current remainder mod x * 2^(32*i).
+ uint64_t rem = 0;
+ // pos is the index of the current 64-bit chunk that we are processing.
+ size_t pos = WORDCOUNT;
+
+ for (size_t q_pos = WORDCOUNT - lower_pos; q_pos > 0; --q_pos) {
+ // q_pos is 1 + the index of the current 64-bit chunk of the quotient
+ // being processed.
+ // Performing the division / modulus with divisor:
+ // x * 2^(64*q_pos - 32),
+ // i.e. using the upper 32-bit of the current 64-bit chunk.
+ rem <<= 32;
+ rem += val[--pos] >> 32;
+ uint64_t q_tmp = rem / x64;
+ rem %= x64;
+
+ // Performing the division / modulus with divisor:
+ // x * 2^(64*(q_pos - 1)),
+ // i.e. using the lower 32-bit of the current 64-bit chunk.
+ rem <<= 32;
+ rem += val[pos] & MASK32;
+ quotient.val[q_pos - 1] = (q_tmp << 32) + rem / x64;
+ rem %= x64;
+ }
+
+ // So far, what we have is:
+ // quotient = y / (x * 2^lower64), and
+ // rem = (y % (x * 2^lower64)) / 2^lower64.
+ // If (lower64 > e), we will need to perform an extra adjustment of the
+ // quotient and remainder, namely:
+ // y / (x * 2^e) = [ y / (x * 2^lower64) ] * 2^(lower64 - e) +
+ // + (rem * 2^(lower64 - e)) / x
+ // (y % (x * 2^e)) / 2^e = (rem * 2^(lower64 - e)) % x
+ size_t last_shift = lower64 - e;
+
+ if (last_shift > 0) {
+ // quotient * 2^(lower64 - e)
+ quotient <<= last_shift;
+ uint64_t q_tmp = 0;
+ uint64_t d = val[--pos];
+ if (last_shift >= 32) {
+ // The shifted value rem * 2^(lower64 - e) might overflow uint64_t, so we
+ // perform a 32-bit shift first.
+ rem <<= 32;
+ rem += d >> 32;
+ d &= MASK32;
+ q_tmp = rem / x64;
+ rem %= x64;
+ last_shift -= 32;
+ } else {
+ // Only use the upper 32-bit of the current 64-bit chunk.
+ d >>= 32;
+ }
+
+ if (last_shift > 0) {
+ rem <<= 32;
+ rem += d;
+ q_tmp <<= last_shift;
+ x64 <<= 32 - last_shift;
+ q_tmp += rem / x64;
+ rem %= x64;
+ }
+
+ quotient.val[0] += q_tmp;
+
+ if (lower64 - e <= 32) {
+ // The remainder rem * 2^(lower64 - e) might overflow to the higher
+ // 64-bit chunk.
+ if (pos < WORDCOUNT - 1) {
+ remainder[pos + 1] = rem >> 32;
+ }
+ remainder[pos] = (rem << 32) + (val[pos] & MASK32);
+ } else {
+ remainder[pos] = rem;
+ }
+
+ } else {
+ remainder[pos] = rem;
+ }
+
+ // Set the remaining lower bits of the remainder.
+ for (; pos > 0; --pos) {
+ remainder[pos - 1] = val[pos - 1];
+ }
+
+ *this = quotient;
+ return remainder;
+ }
+
constexpr UInt<Bits> operator/(const UInt<Bits> &other) const {
UInt<Bits> result(*this);
result.div(other);
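To see the main loop outside the diff context, here is a minimal standalone sketch of the e = 0 case, assuming little-endian 64-bit words as in UInt (the function name and parameters are illustrative): each word is consumed as two 32-bit halves, so (rem << 32) + half always fits in uint64_t because rem < x < 2^32 after every step.

#include <cstddef>
#include <cstdint>

// Divide the little-endian multi-word integer in `val` by a 32-bit x and
// return the remainder (a sketch of the patch's loop with e = 0).
uint32_t div_by_uint32(uint64_t *val, size_t wordcount, uint32_t x) {
  const uint64_t MASK32 = 0xffffffffULL;
  const uint64_t x64 = static_cast<uint64_t>(x);
  uint64_t rem = 0;
  for (size_t pos = wordcount; pos > 0; --pos) {
    uint64_t word = val[pos - 1];
    // Upper 32-bit half: rem < x < 2^32, so this cannot overflow.
    rem = (rem << 32) + (word >> 32);
    uint64_t q_hi = rem / x64;
    rem %= x64;
    // Lower 32-bit half.
    rem = (rem << 32) + (word & MASK32);
    uint64_t q_lo = rem / x64;
    rem %= x64;
    // Both partial quotients are < 2^32, so they pack into one word.
    val[pos - 1] = (q_hi << 32) + q_lo;
  }
  return static_cast<uint32_t>(rem);
}

The committed version additionally skips the words below 2^lower64 entirely and realigns the quotient and remainder afterwards when e is not a multiple of 64.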
diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp
index fb266c25da26..77a6e6b2b39b 100644
--- a/libc/test/src/__support/uint_test.cpp
+++ b/libc/test/src/__support/uint_test.cpp
@@ -533,3 +533,30 @@ TEST(LlvmLibcUIntClassTest, ConstexprInitTests) {
constexpr LL_UInt128 sub = LL_UInt128(5) - LL_UInt128(4);
ASSERT_EQ(sub, LL_UInt128(1));
}
+
+#define TEST_QUICK_DIV_UINT32_POW2(x, e) \
+ do { \
+ LL_UInt320 y({0x8899aabbccddeeffULL, 0x0011223344556677ULL, \
+ 0x583715f4d3b29171ULL, 0xffeeddccbbaa9988ULL, \
+ 0x1f2f3f4f5f6f7f8fULL}); \
+ LL_UInt320 d = LL_UInt320(x); \
+ d <<= e; \
+ LL_UInt320 q1 = y / d; \
+ LL_UInt320 r1 = y % d; \
+ LL_UInt320 r2 = *y.div_uint32_times_pow_2(x, e); \
+ EXPECT_EQ(q1, y); \
+ EXPECT_EQ(r1, r2); \
+ } while (0)
+
+TEST(LlvmLibcUIntClassTest, DivUInt32TimesPow2Tests) {
+ for (size_t i = 0; i < 320; i += 32) {
+ TEST_QUICK_DIV_UINT32_POW2(1, i);
+ TEST_QUICK_DIV_UINT32_POW2(13151719, i);
+ }
+
+ TEST_QUICK_DIV_UINT32_POW2(1, 75);
+ TEST_QUICK_DIV_UINT32_POW2(1, 101);
+
+ TEST_QUICK_DIV_UINT32_POW2(1000000000, 75);
+ TEST_QUICK_DIV_UINT32_POW2(1000000000, 101);
+}
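A hypothetical caller, to show the printf connection from the commit message: with e = 0 the new method reduces to division by a plain uint32_t, which lets a float-to-decimal loop peel nine digits per step, exactly the 1000000000 divisor exercised above. This sketch reuses the test file's LL_UInt320 alias and is not part of the patch.

// Hypothetical usage sketch (not from the patch); assumes the LL_UInt320
// alias defined for the tests above.
void emit_nine_digit_blocks(LL_UInt320 big) {
  while (big != LL_UInt320(0)) {
    // Divide in place by 10^9; the remainder is < 10^9, so its lowest
    // 64-bit word holds the entire nine-digit block.
    LL_UInt320 rem = *big.div_uint32_times_pow_2(1000000000, 0);
    uint64_t block = rem[0];
    // ... emit `block` as nine decimal digits; blocks come out
    // least-significant first ...
    (void)block;
  }
}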