author    Tue Ly <lntue@google.com>    2023-05-12 15:53:59 -0400
committer Tue Ly <lntue@google.com>    2023-05-12 21:01:15 -0400
commit    36b702901a0bff76c1f22926b7f4744f6b760659 (patch)
tree      ad4d3c1cb33a4a90370fe35af4caa7776bb1ee35 /libc
parent    4cc2010f0368b71316acd5895f9a016e7f34c5fb (diff)
download  llvm-36b702901a0bff76c1f22926b7f4744f6b760659.tar.gz
[libc][math] Implement fast division / modulus for UInt / (uint32_t * 2^e).
This is to improve a performance bottleneck of printf for long double.

Reviewed By: michaelrj

Differential Revision: https://reviews.llvm.org/D150475
Diffstat (limited to 'libc')
-rw-r--r--  libc/src/__support/UInt.h              | 120
-rw-r--r--  libc/test/src/__support/uint_test.cpp  |  27
2 files changed, 147 insertions(+), 0 deletions(-)
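Before the diff, it may help to see the core identity on plain 64-bit integers. The sketch below is illustrative only (the function name and the DivResult struct are not from the patch): dividing y by x * 2^e needs just one division by x, because the low e bits of y pass straight through to the remainder.

#include <cassert>
#include <cstdint>

// Toy scalar analog of the patch's idea (illustrative, not the patch's API):
// divide a 64-bit y by (x * 2^e) with a single division by the 32-bit x.
// The low e bits of y belong entirely to the remainder, since
// r % 2^e = y % 2^e.
struct DivResult {
  uint64_t quotient;
  uint64_t remainder;
};

DivResult div_x_times_pow2(uint64_t y, uint32_t x, unsigned e) {
  assert(x != 0 && e < 64); // also assumes x * 2^e fits in 64 bits
  uint64_t low = (e == 0) ? 0 : (y & (~uint64_t(0) >> (64 - e)));
  uint64_t hi = y >> e;               // the bits of y that are >= 2^e
  uint64_t q = hi / x;                // the only real division needed
  uint64_t r = ((hi % x) << e) | low; // reattach the low e bits
  return {q, r};                      // y == q * (x * 2^e) + r
}

The patch applies the same split to a multi-word UInt, with the extra wrinkle that e need not be a multiple of the word size.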
diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h
index 8b273e368964..a702aaad827b 100644
--- a/libc/src/__support/UInt.h
+++ b/libc/src/__support/UInt.h
@@ -342,6 +342,126 @@ template <size_t Bits> struct UInt {
return remainder;
}
+ // Efficiently perform UInt / (x * 2^e), where x is a 32-bit unsigned integer,
+ // and return the remainder.
+ // The main idea is as follows:
+ // Let q = y / (x * 2^e) be the quotient, and
+ // r = y % (x * 2^e) be the remainder.
+ // First, notice that:
+ // r % (2^e) = y % (2^e),
+ // so we just need to focus on the bits of y that are >= 2^e.
+ // To speed up the shift-and-add steps, we only use x as the divisor, and
+ // perform 32-bit shifts instead of bit-by-bit shifts.
+ // Since the remainder of each division step is < x < 2^32, the computation
+ // of each step is properly contained within uint64_t.
+ // Finally, we perform some extra alignment steps for the remaining bits.
+ constexpr optional<UInt<Bits>> div_uint32_times_pow_2(uint32_t x, size_t e) {
+ UInt<Bits> remainder(0);
+
+ if (x == 0) {
+ return nullopt;
+ }
+ if (e >= Bits) {
+ remainder = *this;
+ *this = UInt<Bits>(0);
+ return remainder;
+ }
+
+ UInt<Bits> quotient(0);
+ uint64_t x64 = static_cast<uint64_t>(x);
+ // lower64 = smallest multiple of 64 that is >= e.
+ size_t lower64 = ((e >> 6) + ((e & 63) != 0)) << 6;
+ // lower_pos is the index of the closest 64-bit chunk >= 2^e.
+ size_t lower_pos = lower64 / 64;
+ // Keep track of the current remainder mod x * 2^(32*i).
+ uint64_t rem = 0;
+ // pos is the index of the current 64-bit chunk that we are processing.
+ size_t pos = WORDCOUNT;
+
+ for (size_t q_pos = WORDCOUNT - lower_pos; q_pos > 0; --q_pos) {
+ // q_pos is 1 + the index of the current 64-bit chunk of the quotient
+ // being processed.
+ // Performing the division / modulus with divisor:
+ // x * 2^(64*q_pos - 32),
+ // i.e. using the upper 32-bit of the current 64-bit chunk.
+ rem <<= 32;
+ rem += val[--pos] >> 32;
+ uint64_t q_tmp = rem / x64;
+ rem %= x64;
+
+ // Performing the division / modulus with divisor:
+ // x * 2^(64*(q_pos - 1)),
+ // i.e. using the lower 32-bit of the current 64-bit chunk.
+ rem <<= 32;
+ rem += val[pos] & MASK32;
+ quotient.val[q_pos - 1] = (q_tmp << 32) + rem / x64;
+ rem %= x64;
+ }
+
+ // So far, what we have is:
+ // quotient = y / (x * 2^lower64), and
+ // rem = (y % (x * 2^lower64)) / 2^lower64.
+ // If (lower64 > e), we will need to perform an extra adjustment of the
+ // quotient and remainder, namely:
+ // y / (x * 2^e) = [ y / (x * 2^lower64) ] * 2^(lower64 - e) +
+ // + (rem * 2^(lower64 - e)) / x
+ // (y % (x * 2^e)) / 2^e = (rem * 2^(lower64 - e)) % x
+ size_t last_shift = lower64 - e;
+
+ if (last_shift > 0) {
+ // quotient * 2^(lower64 - e)
+ quotient <<= last_shift;
+ uint64_t q_tmp = 0;
+ uint64_t d = val[--pos];
+ if (last_shift >= 32) {
+ // The shifted value rem * 2^(lower64 - e) might overflow uint64_t, so we
+ // perform a 32-bit shift first.
+ rem <<= 32;
+ rem += d >> 32;
+ d &= MASK32;
+ q_tmp = rem / x64;
+ rem %= x64;
+ last_shift -= 32;
+ } else {
+ // Only use the upper 32-bit of the current 64-bit chunk.
+ d >>= 32;
+ }
+
+ if (last_shift > 0) {
+ rem <<= 32;
+ rem += d;
+ q_tmp <<= last_shift;
+ x64 <<= 32 - last_shift;
+ q_tmp += rem / x64;
+ rem %= x64;
+ }
+
+ quotient.val[0] += q_tmp;
+
+ if (lower64 - e <= 32) {
+ // The remainder rem * 2^(lower64 - e) might overflow to the higher
+ // 64-bit chunk.
+ if (pos < WORDCOUNT - 1) {
+ remainder[pos + 1] = rem >> 32;
+ }
+ remainder[pos] = (rem << 32) + (val[pos] & MASK32);
+ } else {
+ remainder[pos] = rem;
+ }
+
+ } else {
+ remainder[pos] = rem;
+ }
+
+ // Set the remaining lower bits of the remainder.
+ for (; pos > 0; --pos) {
+ remainder[pos - 1] = val[pos - 1];
+ }
+
+ *this = quotient;
+ return remainder;
+ }
+
constexpr UInt<Bits> operator/(const UInt<Bits> &other) const {
UInt<Bits> result(*this);
result.div(other);
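To see the main loop outside the diff context, here is a minimal standalone sketch of the e = 0 case, assuming little-endian 64-bit words as in UInt (the function name and parameters are illustrative): each word is consumed as two 32-bit halves, so (rem << 32) + half always fits in uint64_t because rem < x < 2^32 after every step.

#include <cstddef>
#include <cstdint>

// Divide the little-endian multi-word integer in `val` by a 32-bit x and
// return the remainder (a sketch of the patch's loop with e = 0).
uint32_t div_by_uint32(uint64_t *val, size_t wordcount, uint32_t x) {
  const uint64_t MASK32 = 0xffffffffULL;
  const uint64_t x64 = static_cast<uint64_t>(x);
  uint64_t rem = 0;
  for (size_t pos = wordcount; pos > 0; --pos) {
    uint64_t word = val[pos - 1];
    // Upper 32-bit half: rem < x < 2^32, so this cannot overflow.
    rem = (rem << 32) + (word >> 32);
    uint64_t q_hi = rem / x64;
    rem %= x64;
    // Lower 32-bit half.
    rem = (rem << 32) + (word & MASK32);
    uint64_t q_lo = rem / x64;
    rem %= x64;
    // Both partial quotients are < 2^32, so they pack into one word.
    val[pos - 1] = (q_hi << 32) + q_lo;
  }
  return static_cast<uint32_t>(rem);
}

The committed version additionally skips the words below 2^lower64 entirely and realigns the quotient and remainder afterwards when e is not a multiple of 64.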
diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp
index fb266c25da26..77a6e6b2b39b 100644
--- a/libc/test/src/__support/uint_test.cpp
+++ b/libc/test/src/__support/uint_test.cpp
@@ -533,3 +533,30 @@ TEST(LlvmLibcUIntClassTest, ConstexprInitTests) {
constexpr LL_UInt128 sub = LL_UInt128(5) - LL_UInt128(4);
ASSERT_EQ(sub, LL_UInt128(1));
}
+
+#define TEST_QUICK_DIV_UINT32_POW2(x, e) \
+ do { \
+ LL_UInt320 y({0x8899aabbccddeeffULL, 0x0011223344556677ULL, \
+ 0x583715f4d3b29171ULL, 0xffeeddccbbaa9988ULL, \
+ 0x1f2f3f4f5f6f7f8fULL}); \
+ LL_UInt320 d = LL_UInt320(x); \
+ d <<= e; \
+ LL_UInt320 q1 = y / d; \
+ LL_UInt320 r1 = y % d; \
+ LL_UInt320 r2 = *y.div_uint32_times_pow_2(x, e); \
+ EXPECT_EQ(q1, y); \
+ EXPECT_EQ(r1, r2); \
+ } while (0)
+
+TEST(LlvmLibcUIntClassTest, DivUInt32TimesPow2Tests) {
+ for (size_t i = 0; i < 320; i += 32) {
+ TEST_QUICK_DIV_UINT32_POW2(1, i);
+ TEST_QUICK_DIV_UINT32_POW2(13151719, i);
+ }
+
+ TEST_QUICK_DIV_UINT32_POW2(1, 75);
+ TEST_QUICK_DIV_UINT32_POW2(1, 101);
+
+ TEST_QUICK_DIV_UINT32_POW2(1000000000, 75);
+ TEST_QUICK_DIV_UINT32_POW2(1000000000, 101);
+}
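A hypothetical caller, to show the printf connection from the commit message: with e = 0 the new method reduces to division by a plain uint32_t, which lets a float-to-decimal loop peel nine digits per step, exactly the 1000000000 divisor exercised above. This sketch reuses the test file's LL_UInt320 alias and is not part of the patch.

// Hypothetical usage sketch (not from the patch); assumes the LL_UInt320
// alias defined for the tests above.
void emit_nine_digit_blocks(LL_UInt320 big) {
  while (big != LL_UInt320(0)) {
    // Divide in place by 10^9; the remainder is < 10^9, so its lowest
    // 64-bit word holds the entire nine-digit block.
    LL_UInt320 rem = *big.div_uint32_times_pow_2(1000000000, 0);
    uint64_t block = rem[0];
    // ... emit `block` as nine decimal digits; blocks come out
    // least-significant first ...
    (void)block;
  }
}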