1 files changed, 120 insertions, 0 deletions
diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h
index 8b273e368964..a702aaad827b 100644
--- a/libc/src/__support/UInt.h
+++ b/libc/src/__support/UInt.h
@@ -342,6 +342,126 @@ template <size_t Bits> struct UInt {
     return remainder;
   }
 
+  // Efficiently perform UInt / (x * 2^e), where x is a 32-bit unsigned integer,
+  // and return the remainder.
+  // The main idea is as follow:
+  //   Let q = y / (x * 2^e) be the quotient, and
+  //       r = y % (x * 2^e) be the remainder.
+  //   First, notice that:
+  //     r % (2^e) = y % (2^e),
+  // so we just need to focus on all the bits of y that is >= 2^e.
+  //   To speed up the shift-and-add steps, we only use x as the divisor, and
+  // performing 32-bit shiftings instead of bit-by-bit shiftings.
+  //   Since the remainder of each division step < x < 2^32, the computation of
+  // each step is now properly contained within uint64_t.
+  //   And finally we perform some extra alignment steps for the remaining bits.
+  constexpr optional<UInt<Bits>> div_uint32_times_pow_2(uint32_t x, size_t e) {
+    UInt<Bits> remainder(0);
+
+    if (x == 0) {
+      return nullopt;
+    }
+    if (e >= Bits) {
+      remainder = *this;
+      *this = UInt<Bits>(0);
+      return remainder;
+    }
+
+    UInt<Bits> quotient(0);
+    uint64_t x64 = static_cast<uint64_t>(x);
+    // lower64 = smallest multiple of 64 that is >= e.
+    size_t lower64 = ((e >> 6) + ((e & 63) != 0)) << 6;
+    // lower_pos is the index of the closest 64-bit chunk >= 2^e.
+    size_t lower_pos = lower64 / 64;
+    // Keep track of current remainder mod x * 2^(32*i)
+    uint64_t rem = 0;
+    // pos is the index of the current 64-bit chunk that we are processing.
+    size_t pos = WORDCOUNT;
+
+    for (size_t q_pos = WORDCOUNT - lower_pos; q_pos > 0; --q_pos) {
+      // q_pos is 1 + the index of the current 64-bit chunk of the quotient
+      // being processed.
+      // Performing the division / modulus with divisor:
+      //   x * 2^(64*q_pos - 32),
+      // i.e. using the upper 32-bit of the current 64-bit chunk.
+      rem <<= 32;
+      rem += val[--pos] >> 32;
+      uint64_t q_tmp = rem / x64;
+      rem %= x64;
+
+      // Performing the division / modulus with divisor:
+      //   x * 2^(64*(q_pos - 1)),
+      // i.e. using the lower 32-bit of the current 64-bit chunk.
+      rem <<= 32;
+      rem += val[pos] & MASK32;
+      quotient.val[q_pos - 1] = (q_tmp << 32) + rem / x64;
+      rem %= x64;
+    }
+
+    // So far, what we have is:
+    //   quotient = y / (x * 2^lower64), and
+    //        rem = (y % (x * 2^lower64)) / 2^lower64.
+    // If (lower64 > e), we will need to perform an extra adjustment of the
+    // quotient and remainder, namely:
+    //   y / (x * 2^e) = [ y / (x * 2^lower64) ] * 2^(lower64 - e) +
+    //                   + (rem * 2^(lower64 - e)) / x
+    //   (y % (x * 2^e)) / 2^e = (rem * 2^(lower64 - e)) % x
+    size_t last_shift = lower64 - e;
+
+    if (last_shift > 0) {
+      // quotient * 2^(lower64 - e)
+      quotient <<= last_shift;
+      uint64_t q_tmp = 0;
+      uint64_t d = val[--pos];
+      if (last_shift >= 32) {
+        // The shifting (rem * 2^(lower64 - e)) might overflow uint64_t, so we
+        // perform a 32-bit shift first.
+        rem <<= 32;
+        rem += d >> 32;
+        d &= MASK32;
+        q_tmp = rem / x64;
+        rem %= x64;
+        last_shift -= 32;
+      } else {
+        // Only use the upper 32-bit of the current 64-bit chunk.
+        d >>= 32;
+      }
+
+      if (last_shift > 0) {
+        rem <<= 32;
+        rem += d;
+        q_tmp <<= last_shift;
+        x64 <<= 32 - last_shift;
+        q_tmp += rem / x64;
+        rem %= x64;
+      }
+
+      quotient.val[0] += q_tmp;
+
+      if (lower64 - e <= 32) {
+        // The remainder rem * 2^(lower64 - e) might overflow to the higher
+        // 64-bit chunk.
+        if (pos < WORDCOUNT - 1) {
+          remainder[pos + 1] = rem >> 32;
+        }
+        remainder[pos] = (rem << 32) + (val[pos] & MASK32);
+      } else {
+        remainder[pos] = rem;
+      }
+
+    } else {
+      remainder[pos] = rem;
+    }
+
+    // Set the remaining lower bits of the remainder.
+    for (; pos > 0; --pos) {
+      remainder[pos - 1] = val[pos - 1];
+    }
+
+    *this = quotient;
+    return remainder;
+  }
+
   constexpr UInt<Bits> operator/(const UInt<Bits> &other) const {
     UInt<Bits> result(*this);
     result.div(other);