diff options
author | Marco Bodrato <bodrato@mail.dm.unipi.it> | 2015-05-15 08:00:27 +0200 |
---|---|---|
committer | Marco Bodrato <bodrato@mail.dm.unipi.it> | 2015-05-15 08:00:27 +0200 |
commit | 9d3d9d9e4c10e9b01467a1c4908e8a3b88004fb0 (patch) | |
tree | ec07e9e4842e281565238e97f772bfd8a91f9a95 | |
parent | 792ffc717a22821a1551633eb4fc97aac622580b (diff) | |
download | gmp-9d3d9d9e4c10e9b01467a1c4908e8a3b88004fb0.tar.gz |
mpn/generic/invertappr.c (mpn_ni_invertappr): Reduce memory used (and related updates)
-rw-r--r-- | gmp-impl.h | 2 | ||||
-rw-r--r-- | mpn/generic/invertappr.c | 33 | ||||
-rw-r--r-- | tune/tuneup.c | 6 |
3 files changed, 20 insertions, 21 deletions
diff --git a/gmp-impl.h b/gmp-impl.h index 0ab21c04e..db330effc 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -1466,7 +1466,7 @@ __GMP_DECLSPEC void mpn_invert (mp_ptr, mp_srcptr, mp_size_t, mp_ptr); __GMP_DECLSPEC mp_limb_t mpn_ni_invertappr (mp_ptr, mp_srcptr, mp_size_t, mp_ptr); #define mpn_invertappr __MPN(invertappr) __GMP_DECLSPEC mp_limb_t mpn_invertappr (mp_ptr, mp_srcptr, mp_size_t, mp_ptr); -#define mpn_invertappr_itch(n) (2 * (n) + 4) +#define mpn_invertappr_itch(n) (2 * (n)) #define mpn_binvert __MPN(binvert) __GMP_DECLSPEC void mpn_binvert (mp_ptr, mp_srcptr, mp_size_t, mp_ptr); diff --git a/mpn/generic/invertappr.c b/mpn/generic/invertappr.c index f280dbf41..b8fdac7fc 100644 --- a/mpn/generic/invertappr.c +++ b/mpn/generic/invertappr.c @@ -144,6 +144,9 @@ mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp) B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads "2B^n-1"). + Maximum scratch needed by this branch <= 2*n, but have to fit 3*rn + in the scratch, i.e. 3*rn <= 2*n: we require n>4. + We use a wrapped product modulo B^m-1. NOTE: is there any normalisation problem for the [0] class? It shouldn't: we compute 2*|A*X_h - B^{n+h}| < B^m-1. We may get [0] if and only if we get AX_h = B^{n+h}. This can @@ -159,14 +162,13 @@ mp_limb_t mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) { mp_limb_t cy; - mp_ptr rp; mp_size_t rn, mn; mp_size_t sizes[NPOWS], *sizp; mp_ptr tp; TMP_DECL; #define xp scratch - ASSERT (n > 2); + ASSERT (n > 4); ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n))); @@ -198,9 +200,6 @@ mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) } /* Use Newton's iterations to get the desired precision.*/ - /* define rp scratch; 2rn + 1 limbs <= 2(n>>1 + 1) + 1 <= n + 3 limbs */ - /* Maximum scratch needed by this branch <= 2*n + 4 - USE_MUL_N */ - rp = xp + n; /* n + 3 limbs */ while (1) { n = *--sizp; /* @@ -253,16 +252,16 @@ mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) /* 1 <= cy <= 3 here. */ #if HAVE_NATIVE_mpn_rsblsh1_n if (mpn_cmp (xp, dp - n, n) > 0) { - ASSERT_NOCARRY (mpn_rsblsh1_n (xp, xp, dp - n, n)); + ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n)); ++cy; } else - ASSERT_NOCARRY (mpn_sub_n (xp, dp - n, xp, n)); -#else + ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0)); +#else /* no mpn_rsblsh1_n*/ if (mpn_cmp (xp, dp - n, n) > 0) { ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n)); ++cy; } - ASSERT_NOCARRY (mpn_sub_n (xp, dp - n, xp, n)); + ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0)); #endif MPN_DECR_U(ip - rn, rn, cy); /* 1 <= cy <= 4 here. */ } else { /* "negative" residue class */ @@ -272,18 +271,18 @@ mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) MPN_INCR_U(ip - rn, rn, CNST_LIMB (1)); ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n)); } - mpn_com (xp + n - rn, xp + n - rn, rn); + mpn_com (xp + 2 * n - rn, xp + n - rn, rn); } - /* Compute x_ju_j. FIXME:We need {rp+rn,rn}, mulhi? */ - mpn_mul_n (rp, xp + n - rn, ip - rn, rn); - cy = mpn_add_n (rp + rn, rp + rn, xp + n - rn, 2*rn - n); - cy = mpn_add_nc (ip - n, rp + 3*rn - n, xp + rn, n - rn, cy); + /* Compute x_ju_j. FIXME:We need {xp+rn,rn}, mulhi? */ + mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn); + cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n); + cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy); MPN_INCR_U (ip - rn, rn, cy); if (sizp == sizes) { /* Get out of the cycle */ /* Check for possible carry propagation from below. */ - cy = rp[3*rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */ -/* cy = mpn_add_1 (rp + rn, rp + rn, 2*rn - n, 4); */ + cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */ + /* cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */ break; } rn = n; @@ -291,7 +290,7 @@ mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) TMP_FREE; return cy; -#undef rp +#undef xp } mp_limb_t diff --git a/tune/tuneup.c b/tune/tuneup.c index ff0adece6..32f2b765a 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -1694,12 +1694,12 @@ tune_invertappr (void) param.function = speed_mpn_ni_invertappr; param.name = "INV_MULMOD_BNM1_THRESHOLD"; - param.min_size = 4; + param.min_size = 5; one (&inv_mulmod_bnm1_threshold, ¶m); param.function = speed_mpn_invertappr; param.name = "INV_NEWTON_THRESHOLD"; - param.min_size = 3; + param.min_size = 5; one (&inv_newton_threshold, ¶m); } @@ -1710,7 +1710,7 @@ tune_invert (void) param.function = speed_mpn_invert; param.name = "INV_APPR_THRESHOLD"; - param.min_size = 3; + param.min_size = 5; one (&inv_appr_threshold, ¶m); } |