From a9b1a2f0a7a8d29a9d77df3a5973629ec622c3bc Mon Sep 17 00:00:00 2001
From: Niels Möller
Date: Thu, 1 Jul 2021 20:34:15 +0200
Subject: New mpn_div_qr_1n_pi1 variants, DIV_QR_1N_METHOD 3 and 4.

Not enabled, but hooked into the speed and tuneup programs.
---
 mpn/generic/div_qr_1n_pi1.c | 202 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)

diff --git a/mpn/generic/div_qr_1n_pi1.c b/mpn/generic/div_qr_1n_pi1.c
index cfa1da17b..8d139a9e6 100644
--- a/mpn/generic/div_qr_1n_pi1.c
+++ b/mpn/generic/div_qr_1n_pi1.c
@@ -298,6 +298,208 @@ mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
   return u0;
 }
+#elif DIV_QR_1N_METHOD == 3
+
+/* This variant handles carry from the u update earlier. This gives a
+   longer critical path, but reduces the work needed for the
+   quotients. */
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+                   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t cy, u0;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+  u0 = up[n-1];  /* Early read, to allow qp == up. */
+
+  add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0);
+  u1 -= cy & d;
+  q1 -= cy;
+  qp[n-1] = q1;
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+      mp_limb_t t1, t0;
+
+      /* Additions for the q update:
+       *         +-------+
+       *         |u1 * v |
+       *         +---+---+
+       *         | u1|
+       *         +---+
+       *         | 1 |  (conditional on {u1, u0} carry)
+       *         +---+
+       *       + | q0|
+       *    -+---+---+---+
+       *     | q2| q1| q0|
+       *     +---+---+---+
+       *
+       * Additions for the u update:
+       *         +-------+
+       *         |u1 * B2|
+       *         +---+---+
+       *       + |u0 |u-1|
+       *         +---+---+
+       *       - | d |      (conditional on carry)
+       *      ---+---+---+
+       *         |u1 | u0|
+       *         +---+---+
+       *
+       */
+      umul_ppmm (p1, p0, u1, B2);
+      ADDC_LIMB (q2, q1, u1, q0);
+      umul_ppmm (t1, t0, u1, dinv);
+      add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0);
+      u1 -= cy & d;
+
+      /* t1 <= B-2, so cy can be added in without overflow. */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
+
+#elif DIV_QR_1N_METHOD == 4
+
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+                   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t u2, u0;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t B2d0, B2d1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
+  /* B2 * (B-d) */
+  umul_ppmm (B2d1, B2d0, B2, -d);
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+
+  add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0);
+
+  /* After read of up[n-1], to allow qp == up. */
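+  /* u2 is the carry mask (0 or -1) from the add_mssaaaa above.  When that
+     addition carries, the quotient limb stored below is incremented by one
+     (q1 - u2), and the matching subtraction of d from u1 is deferred to the
+     u1 -= (d & u2) adjustment at the top of the loop. */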
+  qp[n-1] = q1 - u2;
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+      mp_limb_t t1, t0;
+
+      /* Additions for the q update. *After* u1 -= u2 & d adjustment.
+       *         +-------+
+       *         |u1 * v |
+       *         +---+---+
+       *         | u1|
+       *         +---+
+       *         | 1 |  (conditional on {u1, u0} carry)
+       *         +---+
+       *       + | q0|
+       *    -+---+---+---+
+       *     | q2| q1| q0|
+       *     +---+---+---+
+       *
+       * Additions for the u update. *Before* u1 -= u2 & d adjustment.
+       *         +-------+
+       *         |u1 * B2|
+       *         +---+---+
+       *         |u0 |u-1|
+       *         +---+---+
+       *       + |B2(B-d)|  (conditional on u2)
+       *    -+---+---+---+
+       *     |u2 |u1 | u0|
+       *     +---+---+---+
+       *
+       */
+      /* Multiply with unadjusted u1, to shorten critical path. */
+      umul_ppmm (p1, p0, u1, B2);
+      u1 -= (d & u2);
+      ADDC_LIMB (q2, q1, u1, q0);
+      umul_ppmm (t1, t0, u1, dinv);
+
+      add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0);
+      add_mssaaaa (u2, u1, u0, u1, u0, p1, p0);
+      u2 += cy;
+      ASSERT (-u2 <= 1);
+
+      /* t1 <= B-2, so u2 can be added in without overflow. */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+  u1 -= u2 & d;
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
 #else
 #error Unknown DIV_QR_1N_METHOD
 #endif
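
Both variants keep the calling convention of the existing methods: d must be pre-shifted so its high bit is set, u1 is the already-reduced high limb of the dividend (u1 < d), and dinv is the usual 2/1 precomputed inverse. The sketch below shows one way to cross-check whichever variant is compiled in against the plain one-division-per-limb loop (essentially what DIV_QR_1N_METHOD 1 earlier in this file does). It assumes GMP's internal build environment (gmp-impl.h, longlong.h); the helper name check_div_qr_1n is illustrative only and is not part of the patch or of GMP's test suite.

/* Cross-check sketch, not part of the patch.  Compares mpn_div_qr_1n_pi1
   against a per-limb udiv_qrnnd_preinv reference loop. */
#include "gmp-impl.h"
#include "longlong.h"

static void
check_div_qr_1n (mp_srcptr up, mp_size_t n, mp_limb_t u1, mp_limb_t d)
{
  mp_limb_t dinv, r, r_ref;
  mp_ptr qp, qp_ref;
  mp_size_t i;
  TMP_DECL;

  ASSERT_ALWAYS (d & GMP_LIMB_HIGHBIT);  /* d pre-shifted by the caller */
  ASSERT_ALWAYS (u1 < d);                /* high limb already reduced */
  ASSERT_ALWAYS (n > 0);

  TMP_MARK;
  qp = TMP_ALLOC_LIMBS (n);
  qp_ref = TMP_ALLOC_LIMBS (n);

  invert_limb (dinv, d);

  /* Reference: one 2/1 division per limb. */
  r_ref = u1;
  for (i = n - 1; i >= 0; i--)
    udiv_qrnnd_preinv (qp_ref[i], r_ref, r_ref, up[i], d, dinv);

  /* Routine under test, whichever DIV_QR_1N_METHOD was compiled in. */
  r = mpn_div_qr_1n_pi1 (qp, up, n, u1, d, dinv);

  ASSERT_ALWAYS (r == r_ref);
  ASSERT_ALWAYS (mpn_cmp (qp, qp_ref, n) == 0);

  TMP_FREE;
}

Selecting a variant is a matter of building this file with DIV_QR_1N_METHOD defined to 3 or 4; as the commit message notes, the patch itself does not enable them and only hooks the new code into the speed and tuneup programs.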