diff options
author | tege <tege@gmplib.org> | 2000-04-15 22:24:34 +0200 |
---|---|---|
committer | tege <tege@gmplib.org> | 2000-04-15 22:24:34 +0200 |
commit | 7b9a3c35feeb183c3f89ae8430d97ed3813db2ef (patch) | |
tree | eb16400131397ad7ac45072dd46c4a7c20165a73 | |
parent | 95d0da0adb5f39c7e6a9772000b47b0b96d38fdb (diff) | |
download | gmp-7b9a3c35feeb183c3f89ae8430d97ed3813db2ef.tar.gz |
Replacement from Paul Zimmermann.
-rw-r--r-- | mpz/powm.c | 479 |
1 files changed, 282 insertions, 197 deletions
diff --git a/mpz/powm.c b/mpz/powm.c index f596436e2..8d670cd8c 100644 --- a/mpz/powm.c +++ b/mpz/powm.c @@ -1,6 +1,7 @@ /* mpz_powm(res,base,exp,mod) -- Set RES to (base**exp) mod MOD. -Copyright (C) 1991, 1993, 1994, 1996, 1997 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, Inc. +Contributed by Paul Zimmermann. This file is part of the GNU MP Library. @@ -23,256 +24,340 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #include "longlong.h" +/* returns -1/m mod 2^BITS_PER_MP_LIMB, suppose m odd */ +static mp_limb_t +mpz_dmprepare (m) + mpz_t m; +{ + mp_limb_t m0, x; int k; + + m0 = PTR (m)[0]; + ASSERT_ALWAYS (m0 % 2 != 0); + + /* quadratic Hensel lifting (or Newton iteration) */ + x = 1; k = 1; /* invariant: x * m0 = 1 mod 2^k */ + while (k < BITS_PER_MP_LIMB) + { + x += x * (1 - x * m0); + k <<= 1; + } + return -x; +} + +/* set c <- (a*b)/R^n mod m c has to have at least (2n) allocated limbs */ +static void +mpz_redc (c, a, b, m, Nprim) + mpz_t c, a, b, m; + mp_limb_t Nprim; +{ + mp_ptr cp, mp = PTR (m); + mp_limb_t cy, cout = 0; + mp_limb_t q; + size_t j, n = SIZ (m); + + ASSERT (ALLOC (c) >= 2 * n); + + mpz_mul (c, a, b); + cp = PTR (c); + j = ABSIZ (c); + MPN_ZERO (cp + j, 2 * n - j); + for (j = 0; j < n; j++) + { + q = cp[0] * Nprim; + cy = mpn_addmul_1 (cp, mp, n, q); + cout += mpn_add_1 (cp + n, cp + n, n - j, cy); + cp++; + } + cp -= n; + if (cout) + { + cy = cout - mpn_sub_n (cp, cp + n, mp, n); + while (cy) + cy -= mpn_sub_n (cp, cp, mp, n); + } + else + MPN_COPY (cp, cp + n, n); + MPN_NORMALIZE (cp, n); + SIZ (c) = SIZ (c) < 0 ? -n : n; +} + +/* average number of calls to redc for an exponent of n bits + with the sliding window algorithm of base 2^k: the optimal is + obtained for the value of k which minimizes 2^(k-1)+n/(k+1): + + n\k 4 5 6 7 8 + 128 156* 159 171 200 261 + 256 309 307* 316 343 403 + 512 617 607* 610 632 688 + 1024 1231 1204 1195* 1207 1256 + 2048 2461 2399 2366 2360* 2396 + 4096 4918 4787 4707 4665* 4670 +*/ + #ifndef BERKELEY_MP void #if __STDC__ -mpz_powm (mpz_ptr res, mpz_srcptr base, mpz_srcptr exp, mpz_srcptr mod) +mpz_powm (mpz_ptr res, mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod) #else -mpz_powm (res, base, exp, mod) +mpz_powm (res, base, e, mod) mpz_ptr res; mpz_srcptr base; - mpz_srcptr exp; + mpz_srcptr e; mpz_srcptr mod; #endif #else /* BERKELEY_MP */ void #if __STDC__ -pow (mpz_srcptr base, mpz_srcptr exp, mpz_srcptr mod, mpz_ptr res) +pow (mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod, mpz_ptr res) #else -pow (base, exp, mod, res) +pow (base, e, mod, res) mpz_srcptr base; - mpz_srcptr exp; + mpz_srcptr e; mpz_srcptr mod; mpz_ptr res; #endif #endif /* BERKELEY_MP */ { - mp_ptr rp, ep, mp, bp; - mp_size_t esize, msize, bsize, rsize; - mp_size_t size; - int mod_shift_cnt; - int negative_result; - mp_limb_t *free_me = NULL; - size_t free_me_size; - TMP_DECL (marker); - - esize = ABS (exp->_mp_size); - msize = ABS (mod->_mp_size); - size = 2 * msize; + mp_limb_t invm, *ep, c, mask; + mpz_t xx, *g; + mp_size_t n, i, K, j, l, k; + int sh; + int use_redc; + +#ifdef DEBUG + mpz_t exp; + mpz_init (exp); +#endif - rp = res->_mp_d; - ep = exp->_mp_d; + n = SIZ (mod); - if (msize == 0) - msize = 1 / msize; /* provoke a signal */ + if (n == 0) + DIVIDE_BY_ZERO; - if (esize == 0) + if (SIZ (e) == 0) { /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 - depending on if MOD equals 1. */ - rp[0] = 1; - res->_mp_size = (msize == 1 && (mod->_mp_d)[0] == 1) ? 0 : 1; + depending on if MOD equals 1. */ + PTR(res)[0] = 1; + SIZ(res) = (ABSIZ (mod) == 1 && (PTR(mod))[0] == 1) ? 0 : 1; return; } - TMP_MARK (marker); - - /* Normalize MOD (i.e. make its most significant bit set) as required by - mpn_divmod. This will make the intermediate values in the calculation - slightly larger, but the correct result is obtained after a final - reduction using the original MOD value. */ + /* Use REDC instead of usual reduction for small sizes, more precisely using + REDC each modular multiplication cost about 2*n^2 limbs operations, + whereas using usual reduction it costs 3*K(n), where K(n) is the cost of a + multiplication using Karatsuba, and a division is assumed to cost 2*K(n), + for example using Burnikel-Ziegler's algorithm. This gives a theoretical + threshold of a*KARATSUBA_SQR_THRESHOLD, with a=(3/2)^(1/(2-ln(3)/ln(2))) ~ + 2.66. */ + /* For now, also disable REDC when MOD is even, as mpz_dmprepare cannot cope + with that. */ + + use_redc = (3 * n < 8 * KARATSUBA_SQR_THRESHOLD && PTR(mod)[0] % 2 != 0); + if (use_redc) + invm = mpz_dmprepare (mod); + + /* determines optimal value of k */ + l = ABSIZ (e) * BITS_PER_MP_LIMB; /* number of bits of exponent */ + k = 1; + K = 2; + while (2 * l > K * (2 + k * (3 + k))) + { + k++; + K *= 2; + } - mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB); - count_leading_zeros (mod_shift_cnt, mod->_mp_d[msize - 1]); - if (mod_shift_cnt != 0) - mpn_lshift (mp, mod->_mp_d, msize, mod_shift_cnt); + g = (*_mp_allocate_func) (K / 2 * sizeof (mpz_t)); + /* compute x*R^n where R=2^BITS_PER_MP_LIMB */ + mpz_init (g[0]); + if (use_redc) + { + mpz_mul_2exp (g[0], base, n * BITS_PER_MP_LIMB); + mpz_mod (g[0], g[0], mod); + } else - MPN_COPY (mp, mod->_mp_d, msize); + mpz_mod (g[0], base, mod); - bsize = ABS (base->_mp_size); - if (bsize > msize) + /* compute xx^g for odd g < 2^k */ + mpz_init (xx); + if (use_redc) { - /* The base is larger than the module. Reduce it. */ - - /* Allocate (BSIZE + 1) with space for remainder and quotient. - (The quotient is (bsize - msize + 1) limbs.) */ - bp = (mp_ptr) TMP_ALLOC ((bsize + 1) * BYTES_PER_MP_LIMB); - MPN_COPY (bp, base->_mp_d, bsize); - /* We don't care about the quotient, store it above the remainder, - at BP + MSIZE. */ - mpn_divmod (bp + msize, bp, bsize, mp, msize); - bsize = msize; - /* Canonicalize the base, since we are going to multiply with it - quite a few times. */ - MPN_NORMALIZE (bp, bsize); + _mpz_realloc (xx, 2 * n); + mpz_redc (xx, g[0], g[0], mod, invm); /* xx = x^2*R^n */ } else - bp = base->_mp_d; - - if (bsize == 0) { - res->_mp_size = 0; - TMP_FREE (marker); - return; + mpz_mul (xx, g[0], g[0]); + mpz_mod (xx, xx, mod); } - - if (res->_mp_alloc < size) + for (i = 1; i < K / 2; i++) { - /* We have to allocate more space for RES. If any of the input - parameters are identical to RES, defer deallocation of the old - space. */ - - if (rp == ep || rp == mp || rp == bp) + mpz_init (g[i]); + if (use_redc) { - free_me = rp; - free_me_size = res->_mp_alloc; + _mpz_realloc (g[i], 2 * n); + mpz_redc (g[i], g[i - 1], xx, mod, invm); /* g[i] = x^(2i+1)*R^n */ } else - (*_mp_free_func) (rp, res->_mp_alloc * BYTES_PER_MP_LIMB); - - rp = (mp_ptr) (*_mp_allocate_func) (size * BYTES_PER_MP_LIMB); - res->_mp_alloc = size; - res->_mp_d = rp; - } - else - { - /* Make BASE, EXP and MOD not overlap with RES. */ - if (rp == bp) { - /* RES and BASE are identical. Allocate temp. space for BASE. */ - bp = (mp_ptr) TMP_ALLOC (bsize * BYTES_PER_MP_LIMB); - MPN_COPY (bp, rp, bsize); + mpz_mul (g[i], g[i - 1], xx); + mpz_mod (g[i], g[i], mod); } - if (rp == ep) + } + + /* now starts the real stuff */ + mask = (mp_limb_t) ((1<<k) - 1); + ep = PTR (e); + i = SIZ (e) - 1; /* current index */ + c = ep[i]; /* current limb */ + count_leading_zeros (sh, c); + sh = BITS_PER_MP_LIMB - sh; /* significant bits in ep[i] */ + sh -= k; /* index of lower bit of ep[i] to take into account */ + if (sh < 0) + { /* k-sh extra bits are needed */ + if (i > 0) { - /* RES and EXP are identical. Allocate temp. space for EXP. */ - ep = (mp_ptr) TMP_ALLOC (esize * BYTES_PER_MP_LIMB); - MPN_COPY (ep, rp, esize); + i--; + c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh)); + sh += BITS_PER_MP_LIMB; } - if (rp == mp) + } + else + c = c >> sh; +#ifdef DEBUG + printf ("-1/m mod 2^%u = %lu\n", BITS_PER_MP_LIMB, invm); + mpz_set_ui (exp, c); +#endif + j=0; + while (c % 2 == 0) + { + j++; + c = (c >> 1); + } + mpz_set (xx, g[c >> 1]); + while (j--) + { + if (use_redc) + mpz_redc (xx, xx, xx, mod, invm); + else { - /* RES and MOD are identical. Allocate temporary space for MOD. */ - mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB); - MPN_COPY (mp, rp, msize); + mpz_mul (xx, xx, xx); + mpz_mod (xx, xx, mod); } } - MPN_COPY (rp, bp, bsize); - rsize = bsize; - - { - mp_size_t i; - mp_ptr xp = (mp_ptr) TMP_ALLOC (2 * (msize + 1) * BYTES_PER_MP_LIMB); - int c; - mp_limb_t e; - mp_limb_t carry_limb; - - negative_result = (ep[0] & 1) && base->_mp_size < 0; - - i = esize - 1; - e = ep[i]; - count_leading_zeros (c, e); - e = (e << c) << 1; /* shift the exp bits to the left, lose msb */ - c = BITS_PER_MP_LIMB - 1 - c; - - /* Main loop. - - Make the result be pointed to alternately by XP and RP. This - helps us avoid block copying, which would otherwise be necessary - with the overlap restrictions of mpn_divmod. With 50% probability - the result after this loop will be in the area originally pointed - by RP (==RES->_mp_d), and with 50% probability in the area originally - pointed to by XP. */ +#ifdef DEBUG + printf ("x^"); mpz_out_str (0, 10, exp); + printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx); + putchar ('\n'); +#endif - for (;;) - { - while (c != 0) - { - mp_ptr tp; - mp_size_t xsize; + while (i > 0 || sh > 0) + { + c = ep[i]; + sh -= k; + l = k; /* number of bits treated */ + if (sh < 0) + { + if (i > 0) + { + i--; + c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh)); + sh += BITS_PER_MP_LIMB; + } + else + { + l += sh; /* may be less bits than k here */ + c = c & ((1<<l) - 1); + } + } + else + c = c >> sh; + c = c & mask; - mpn_mul_n (xp, rp, rp, rsize); - xsize = 2 * rsize; - xsize -= xp[xsize - 1] == 0; - if (xsize > msize) - { - mpn_divmod (xp + msize, xp, xsize, mp, msize); - xsize = msize; - } + /* this while loop implements the sliding window improvement */ + while ((c & (1 << (k - 1))) == 0 && (i > 0 || sh > 0)) + { + if (use_redc) mpz_redc (xx, xx, xx, mod, invm); + else + { + mpz_mul (xx, xx, xx); + mpz_mod (xx, xx, mod); + } + if (sh) + { + sh--; + c = (c<<1) + ((ep[i]>>sh) & 1); + } + else + { + i--; + sh = BITS_PER_MP_LIMB - 1; + c = (c<<1) + (ep[i]>>sh); + } + } - tp = rp; rp = xp; xp = tp; - rsize = xsize; +#ifdef DEBUG + printf ("l=%u c=%lu\n", l, c); + mpz_mul_2exp (exp, exp, k); + mpz_add_ui (exp, exp, c); +#endif - if ((mp_limb_signed_t) e < 0) + /* now replace xx by xx^(2^k)*x^c */ + if (c != 0) + { + j = 0; + while (c % 2 == 0) + { + j++; + c = c >> 1; + } + /* c0 = c * 2^j, i.e. xx^(2^k)*x^c = (A^(2^(k - j))*c)^(2^j) */ + l -= j; + while (l--) + if (use_redc) mpz_redc (xx, xx, xx, mod, invm); + else { - mpn_mul (xp, rp, rsize, bp, bsize); - xsize = rsize + bsize; - xsize -= xp[xsize - 1] == 0; - if (xsize > msize) - { - mpn_divmod (xp + msize, xp, xsize, mp, msize); - xsize = msize; - } - - tp = rp; rp = xp; xp = tp; - rsize = xsize; + mpz_mul (xx, xx, xx); + mpz_mod (xx, xx, mod); } - e <<= 1; - c--; - } - - i--; - if (i < 0) - break; - e = ep[i]; - c = BITS_PER_MP_LIMB; - } - - /* We shifted MOD, the modulo reduction argument, left MOD_SHIFT_CNT - steps. Adjust the result by reducing it with the original MOD. - - Also make sure the result is put in RES->_mp_d (where it already - might be, see above). */ - - if (mod_shift_cnt != 0) - { - carry_limb = mpn_lshift (res->_mp_d, rp, rsize, mod_shift_cnt); - rp = res->_mp_d; - if (carry_limb != 0) - { - rp[rsize] = carry_limb; - rsize++; - } - } - else - { - MPN_COPY (res->_mp_d, rp, rsize); - rp = res->_mp_d; - } - - if (rsize >= msize) - { - mpn_divmod (rp + msize, rp, rsize, mp, msize); - rsize = msize; - } - - /* Remove any leading zero words from the result. */ - if (mod_shift_cnt != 0) - mpn_rshift (rp, rp, rsize, mod_shift_cnt); - MPN_NORMALIZE (rp, rsize); - } + if (use_redc) + mpz_redc (xx, xx, g[c >> 1], mod, invm); + else + { + mpz_mul (xx, xx, g[c >> 1]); + mpz_mod (xx, xx, mod); + } + } + else + j = l; /* case c=0 */ + while (j--) + { + if (use_redc) + mpz_redc (xx, xx, xx, mod, invm); + else + { + mpz_mul (xx, xx, xx); + mpz_mod (xx, xx, mod); + } + } +#ifdef DEBUG + printf ("x^"); mpz_out_str (0, 10, exp); + printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx); + putchar ('\n'); +#endif + } - if (negative_result && rsize != 0) + /* now convert back xx to xx/R^n */ + if (use_redc) { - if (mod_shift_cnt != 0) - mpn_rshift (mp, mp, msize, mod_shift_cnt); - mpn_sub (rp, mp, msize, rp, rsize); - rsize = msize; - MPN_NORMALIZE (rp, rsize); + mpz_set_ui (g[0], 1); + mpz_redc (xx, xx, g[0], mod, invm); } - res->_mp_size = rsize; + mpz_set (res, xx); - if (free_me != NULL) - (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB); - TMP_FREE (marker); + mpz_clear (xx); + for (i = 0; i < K / 2; i++) + mpz_clear (g[i]); + (*_mp_free_func) (g, K / 2 * sizeof (mpz_t)); } |