From 4efbb58f64cfc98d4cb3977795781e2175e8803f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Thu, 3 Jun 2021 23:50:08 +0200
Subject: Micro-optimization and docs for mpn_div_qr_1n_pi1

---
 mpn/generic/div_qr_1n_pi1.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

(limited to 'mpn')

diff --git a/mpn/generic/div_qr_1n_pi1.c b/mpn/generic/div_qr_1n_pi1.c
index 8e5c1b0a4..cfa1da17b 100644
--- a/mpn/generic/div_qr_1n_pi1.c
+++ b/mpn/generic/div_qr_1n_pi1.c
@@ -193,6 +193,23 @@ mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
 
 #elif DIV_QR_1N_METHOD == 2
 
+/* The main idea of this algorithm is to write B^2 = d (B + dinv) +
+   B2, where 1 <= B2 < d. Similarly to mpn_mod_1_1p, each iteration
+   can then replace
+
+     u1 B^2 = u1 B2 (mod d)
+
+   which gives a very short critical path for computing the remainder
+   (with some tricks to handle the carry when the next two lower limbs
+   are added in). To also get the quotient, include the corresponding
+   multiple of d in the expression,
+
+     u1 B^2 = u1 B2 + (u1 dinv + u1 B) d
+
+   We get the quotient by accumulating the (u1 dinv + u1 B) terms. The
+   two multiplies, u1 * B2 and u1 * dinv, are independent, and can be
+   executed in parallel.
+ */
 mp_limb_t
 mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
 		   mp_limb_t d, mp_limb_t dinv)
@@ -248,17 +265,17 @@ mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
        *    +---+---+---+
       */
       umul_ppmm (p1, t, u1, dinv);
+      ADDC_LIMB (cy, u0, u0, u2 & B2);
+      u0 -= (-cy) & d;
       add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
-      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
       add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
       q0 = t;
 
+      /* Note that p1 + cy cannot overflow */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy);
+
       umul_ppmm (p1, p0, u1, B2);
-      ADDC_LIMB (cy, u0, u0, u2 & B2);
-      u0 -= (-cy) & d;
 
-      /* Final q update */
-      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
       qp[j+1] = q1;
       MPN_INCR_U (qp+j+2, n-j-2, q2);
 
-- 
cgit v1.2.1