From 4efbb58f64cfc98d4cb3977795781e2175e8803f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= Date: Thu, 3 Jun 2021 23:50:08 +0200 Subject: Micro-optimization and docs for mpn_div_qr_1n_pi1 --- mpn/generic/div_qr_1n_pi1.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'mpn') diff --git a/mpn/generic/div_qr_1n_pi1.c b/mpn/generic/div_qr_1n_pi1.c index 8e5c1b0a4..cfa1da17b 100644 --- a/mpn/generic/div_qr_1n_pi1.c +++ b/mpn/generic/div_qr_1n_pi1.c @@ -193,6 +193,23 @@ mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh, #elif DIV_QR_1N_METHOD == 2 +/* The main idea of this algorithm is to write B^2 = d (B + dinv) + + B2, where 1 <= B2 < d. Similarly to mpn_mod_1_1p, each iteration + can then replace + + u1 B^2 = u1 B2 (mod d) + + which gives a very short critical path for computing the remainder + (with some tricks to handle the carry when the next two lower limbs + are added in). To also get the quotient, include the corresponding + multiple of d in the expression, + + u1 B^2 = u1 B2 + (u1 dinv + u1 B) d + + We get the quotient by accumulating the (u1 dinv + u1 B) terms. The + two multiplies, u1 * B2 and u1 * dinv, are independent, and can be + executed in parallel. + */ mp_limb_t mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, mp_limb_t d, mp_limb_t dinv) @@ -248,17 +265,17 @@ mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, * +---+---+---+ */ umul_ppmm (p1, t, u1, dinv); + ADDC_LIMB (cy, u0, u0, u2 & B2); + u0 -= (-cy) & d; add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1); - add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1); add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0); q0 = t; + /* Note that p1 + cy cannot overflow */ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy); + umul_ppmm (p1, p0, u1, B2); - ADDC_LIMB (cy, u0, u0, u2 & B2); - u0 -= (-cy) & d; - /* Final q update */ - add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy); qp[j+1] = q1; MPN_INCR_U (qp+j+2, n-j-2, q2); -- cgit v1.2.1