summaryrefslogtreecommitdiff
path: root/mpn
diff options
context:
space:
mode:
authorNiels Möller <nisse@lysator.liu.se>2021-06-03 23:50:08 +0200
committerNiels Möller <nisse@lysator.liu.se>2021-06-03 23:50:08 +0200
commit4efbb58f64cfc98d4cb3977795781e2175e8803f (patch)
tree30aaa7205c80ddc86ad1426efcf36acac9a60974 /mpn
parent5cb6bdf47d96d275ee379bd171273662f80f7758 (diff)
downloadgmp-4efbb58f64cfc98d4cb3977795781e2175e8803f.tar.gz
Micro-optimization and docs for mpn_div_qr_1n_pi1
Diffstat (limited to 'mpn')
-rw-r--r--mpn/generic/div_qr_1n_pi1.c27
1 files changed, 22 insertions, 5 deletions
diff --git a/mpn/generic/div_qr_1n_pi1.c b/mpn/generic/div_qr_1n_pi1.c
index 8e5c1b0a4..cfa1da17b 100644
--- a/mpn/generic/div_qr_1n_pi1.c
+++ b/mpn/generic/div_qr_1n_pi1.c
@@ -193,6 +193,23 @@ mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
#elif DIV_QR_1N_METHOD == 2
+/* The main idea of this algorithm is to write B^2 = d (B + dinv) +
+ B2, where 1 <= B2 < d. Similarly to mpn_mod_1_1p, each iteration
+ can then replace
+
+ u1 B^2 = u1 B2 (mod d)
+
+ which gives a very short critical path for computing the remainder
+ (with some tricks to handle the carry when the next two lower limbs
+ are added in). To also get the quotient, include the corresponding
+ multiple of d in the expression,
+
+ u1 B^2 = u1 B2 + (u1 dinv + u1 B) d
+
+ We get the quotient by accumulating the (u1 dinv + u1 B) terms. The
+ two multiplies, u1 * B2 and u1 * dinv, are independent, and can be
+ executed in parallel.
+ */
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
mp_limb_t d, mp_limb_t dinv)
@@ -248,17 +265,17 @@ mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
* +---+---+---+
*/
umul_ppmm (p1, t, u1, dinv);
+ ADDC_LIMB (cy, u0, u0, u2 & B2);
+ u0 -= (-cy) & d;
add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
- add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
q0 = t;
+ /* Note that p1 + cy cannot overflow */
+ add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy);
+
umul_ppmm (p1, p0, u1, B2);
- ADDC_LIMB (cy, u0, u0, u2 & B2);
- u0 -= (-cy) & d;
- /* Final q update */
- add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
qp[j+1] = q1;
MPN_INCR_U (qp+j+2, n-j-2, q2);