author     vlefevre <vlefevre@280ebfd0-de03-0410-8827-d642c229c3f4>  2017-05-04 07:46:57 +0000
committer  vlefevre <vlefevre@280ebfd0-de03-0410-8827-d642c229c3f4>  2017-05-04 07:46:57 +0000
commit     6de18e36a603b2b4c3f065119de5d5fd78367b51 (patch)
tree       333e193c333ea5369d4fcd17e5fbb469dd50c40d /src
parent     c64ffc2f9958b4fa2eab676fee377ef0fbcaddba (diff)
download   mpfr-6de18e36a603b2b4c3f065119de5d5fd78367b51.tar.gz
Merged r11179-11196 from the trunk (no conflicts).
git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/branches/faithful@11453 280ebfd0-de03-0410-8827-d642c229c3f4
Diffstat (limited to 'src')
-rw-r--r--  src/add1sp.c             |  22
-rw-r--r--  src/div.c                | 205
-rw-r--r--  src/frac.c               |   2
-rw-r--r--  src/mpfr-gmp.h           |  13
-rw-r--r--  src/mpfr-impl.h          |   3
-rw-r--r--  src/rndna.c              |  15
-rw-r--r--  src/round_raw_generic.c  |  12
-rw-r--r--  src/set_uj.c             |   2
-rw-r--r--  src/sqr.c                |  42
-rw-r--r--  src/sqrt.c               |  56
-rw-r--r--  src/strtofr.c            |   2
-rw-r--r--  src/sub1sp.c             |   8
-rw-r--r--  src/sum.c                |   2
13 files changed, 291 insertions, 93 deletions
diff --git a/src/add1sp.c b/src/add1sp.c index 03a8ba929..4182cfcf1 100644 --- a/src/add1sp.c +++ b/src/add1sp.c @@ -123,6 +123,8 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, if (bx == cx) { + /* TODO: a0 = (bp[0] >> 1) + (cp[0] >> 1) is probably better + (no long constant to load in a register). */ /* since bp[0], cp[0] >= MPFR_LIMB_HIGHBIT, a carry always occurs */ a0 = MPFR_LIMB_HIGHBIT | ((bp[0] + cp[0]) >> 1); bx ++; @@ -135,14 +137,17 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, BGreater1: d = (mpfr_uexp_t) bx - cx; mask = MPFR_LIMB_MASK(sh); + /* TODO: Should the case d < sh be removed, i.e. seen as a particular + case of d < GMP_NUMB_BITS? This case would do a bit more operations + but a test would be removed, avoiding pipeline stall issues. */ if (d < sh) { /* we can shift c by d bits to the right without losing any bit, moreover we can shift one more if there is an exponent increase */ - rb = bp[0]; - a0 = rb + (cp[0] >> d); - if (a0 < rb) /* carry */ + a0 = bp[0] + (cp[0] >> d); + if (a0 < bp[0]) /* carry */ { + MPFR_ASSERTD ((a0 & 1) == 0); a0 = MPFR_LIMB_HIGHBIT | (a0 >> 1); bx ++; } @@ -152,10 +157,9 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, } else if (d < GMP_NUMB_BITS) /* sh <= d < GMP_NUMB_BITS */ { - rb = bp[0]; sb = cp[0] << (GMP_NUMB_BITS - d); /* bits from cp[-1] after shift */ - a0 = rb + (cp[0] >> d); - if (a0 < rb) + a0 = bp[0] + (cp[0] >> d); + if (a0 < bp[0]) /* carry */ { sb |= a0 & 1; a0 = MPFR_LIMB_HIGHBIT | (a0 >> 1); @@ -181,9 +185,9 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, goto BGreater1; } - /* Note: we could keep the output significand in a0 for the rounding, and only - store it in ap[0] at the very end, but this seems slower on average (but better - for the worst case). */ + /* Note: we could keep the output significand in a0 for the rounding, + and only store it in ap[0] at the very end, but this seems slower + on average (but better for the worst case). */ /* now perform rounding */ if (MPFR_UNLIKELY(bx > __gmpfr_emax)) @@ -37,6 +37,7 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., #include "invert_limb.h" +#if 1 /* Given u = u1*B+u0 < d = d1*B+d0 with d normalized (high bit of d1 set), put in v = v1*B+v0 an approximation of floor(u*B^2/d), with: B = 2^GMP_NUMB_BITS and v <= floor(u*B^2/d) <= v + 16. @@ -78,11 +79,87 @@ mpfr_div2_approx (mpfr_limb_ptr v1, mpfr_limb_ptr v0, *v1 += 1 + (*v0 < x); } } +#else +/* the following is some experimental code, not fully proven correct, + and which does not seem to be faster, unless we find a way to speed up + the while loop (if 'while' is replaced by 'if' it becomes faster than + the above code, but is then wrong) */ +static void +mpfr_div2_approx (mpfr_limb_ptr Q1, mpfr_limb_ptr Q0, + mp_limb_t u1, mp_limb_t u0, + mp_limb_t v1, mp_limb_t v0) +{ + mp_limb_t inv, q1, q0, r2, r1, r0, cy; + + /* first compute an approximation of q1 */ + if (MPFR_UNLIKELY(v1 == MPFR_LIMB_MAX)) + inv = MPFR_LIMB_ZERO; + else + __gmpfr_invert_limb_approx (inv, v1 + 1); + /* now inv <= B^2/(v1+1) - B */ + umul_ppmm (q1, q0, u1, inv); + q1 += u1; + + /* we have q1 <= floor(u1*2^GMP_NUMB_BITS/v1) <= q1 + 2 */ + + umul_ppmm (r2, r1, q1, v1); + umul_ppmm (cy, r0, q1, v0); + r1 += cy; + r2 += (r1 < cy); + + /* Now r2:r1:r0 = q1 * v1:v0. While q1*v1 <= u1:0, it might be that + r2:r1 > u1:u0. First subtract r2:r1:r0 from u1:u0:0, with result + in r2:r1:r0. 
*/ + r0 = -r0; + r1 = u0 - r1 - (r0 != 0); + r2 = u1 - r2 - (r1 > u0 || (r1 == u0 && r0 != 0)); + + /* u1:u0:0 - q1 * v1:v0 >= u1*B^2 - q1*(v1*B+B-1) >= -q1*B > -B^2 + thus r2 can be 0 or -1 here. */ + if (r2 & MPFR_LIMB_HIGHBIT) /* correction performed at most two times */ + { + ADD_LIMB(r0, v0, cy); /* r0 += v0; cy = r0 < v0 */ + ADD_LIMB(r1, v1, cy); + r2 += cy; + q1 --; + if (r2 & MPFR_LIMB_HIGHBIT) + { + ADD_LIMB(r0, v0, cy); /* r0 += v0; cy = r0 < v0 */ + ADD_LIMB(r1, v1, cy); + r2 += cy; + q1 --; + } + } + else + { + /* experimentally, the number of loops is <= 5 */ + while (r2 || r1 > v1 || (r1 == v1 && r0 >= v0)) + { + r2 -= (r1 < v1 || (r1 == v1 && r0 < v0)); + r1 -= v1 + (r0 < v0); + r0 -= v0; + q1 ++; + } + } + + /* now u1:u0:0 = q1 * d1:d0 + r1:r0, with 0 <= r1:r0 < d1:d0 */ + MPFR_ASSERTD(r2 == MPFR_LIMB_ZERO); + MPFR_ASSERTD(r1 < v1 || (r1 == v1 && r0 < v0)); + + /* the second quotient limb is approximated by r1*B / d1, + which is itself approximated by r1+(r1*inv/B) */ + umul_ppmm (q0, r0, r1, inv); + + *Q1 = q1; + *Q0 = r1 + q0; + + /* experimentally: q1:q0 <= floor(B^2*u/v) <= q1:q0 + 5 */ +} +#endif #endif /* GMP_NUMB_BITS == 64 */ -/* Special code for p=PREC(q) < GMP_NUMB_BITS, - and PREC(u), PREC(v) <= GMP_NUMB_BITS */ +/* Special code for PREC(q) = PREC(u) = PREC(v) = p < GMP_NUMB_BITS */ static int mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) { @@ -99,31 +176,60 @@ mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) u0 -= v0; #if GMP_NUMB_BITS == 64 /* __gmpfr_invert_limb_approx only exists for 64-bit */ - /* first try with an approximate quotient */ + /* First try with an approximate quotient. + FIXME: for p<=62 we have sh-1<2 and will never be able to round correctly. + Even for p=61 we have sh-1=2 and we can round correctly only when the two + last bist of q0 are 01, which happens with probability 25% only. */ { mp_limb_t inv; __gmpfr_invert_limb_approx (inv, v0); - umul_ppmm (q0, sb, u0, inv); + umul_ppmm (rb, sb, u0, inv); } - q0 = (q0 + u0) >> extra; - /* before the >> extra shift, q0 + u0 does not exceed the true quotient - floor(u'0*2^GMP_NUMB_BITS/v0), with error at most 2, which means the rational - quotient q satisfies q0 + u0 <= q < q0 + u0 + 3. We can round correctly except - when the last sh-1 bits of q0 are 000..000 or 111..111 or 111..110. */ + rb += u0; + q0 = rb >> extra; + /* rb does not exceed the true quotient floor(u0*2^GMP_NUMB_BITS/v0), + with error at most 2, which means the rational quotient q satisfies + rb <= q < rb + 3. We can round correctly except when the last sh-1 bits + of q0 are 000..000 or 111..111 or 111..110. 
*/ if (MPFR_LIKELY(((q0 + 2) & (mask >> 1)) > 2)) { rb = q0 & (MPFR_LIMB_ONE << (sh - 1)); - sb = 1; /* result cannot be exact when we can round with an approximation */ + sb = 1; /* result cannot be exact in this case */ } - else /* compute exact quotient and remainder */ -#endif + else /* the true quotient is rb, rb+1 or rb+2 */ { - udiv_qrnnd (q0, sb, u0, 0, v0); - sb |= q0 & extra; + mp_limb_t h, l; + q0 = rb; + umul_ppmm (h, l, q0, v0); + MPFR_ASSERTD(h < u0 || (h == u0 && l == MPFR_LIMB_ZERO)); + /* subtract {h,l} from {u0,0} */ + sub_ddmmss (h, l, u0, 0, h, l); + /* the remainder {h, l} should be < v0 */ + if (h || l >= v0) + { + q0 ++; + h -= (l < v0); + l -= v0; + } + if (h || l >= v0) + { + q0 ++; + h -= (l < v0); + l -= v0; + } + MPFR_ASSERTD(h == 0 && l < v0); + sb = l | (q0 & extra); q0 >>= extra; rb = q0 & (MPFR_LIMB_ONE << (sh - 1)); sb |= q0 & (mask >> 1); } +#else + udiv_qrnnd (q0, sb, u0, 0, v0); + sb |= q0 & extra; + q0 >>= extra; + rb = q0 & (MPFR_LIMB_ONE << (sh - 1)); + sb |= q0 & (mask >> 1); +#endif qp[0] = (MPFR_LIMB_HIGHBIT | q0) & ~mask; qx += extra; @@ -165,7 +271,12 @@ mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) } else if (rnd_mode == MPFR_RNDN) { - if (rb == 0 || (sb == 0 && (qp[0] & (MPFR_LIMB_ONE << sh)) == 0)) + /* It is not possible to have rb <> 0 and sb = 0 here, since it would + mean a n-bit by n-bit division gives an exact (n+1)-bit number. + And since the case rb = sb = 0 was already dealt with, we cannot + have sb = 0. Thus we cannot be in the middle of two numbers. */ + MPFR_ASSERTD(sb != 0); + if (rb == 0) goto truncate; else goto add_one_ulp; @@ -194,7 +305,7 @@ mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) } /* Special code for GMP_NUMB_BITS < PREC(q) < 2*GMP_NUMB_BITS and - GMP_NUMB_BITS < PREC(u), PREC(v) <= 2*GMP_NUMB_BITS */ + PREC(u) = PREC(v) = PREC(q) */ static int mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) { @@ -220,10 +331,42 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) /* we know q1*B+q0 is smaller or equal to the exact quotient, with difference at most 16 */ if (MPFR_LIKELY(((q0 + 16) & (mask >> 1)) > 16)) + sb = 1; /* result is not exact when we can round with an approximation */ + else { - sb = 1; /* result is not exact when we can round with an approximation */ - goto round_div2; + /* we know q1:q0 is a good-enough approximation, use it! */ + mp_limb_t qq[2], uu[4]; + + qq[0] = q0; + qq[1] = q1; + /* FIXME: instead of using mpn_mul_n, we can use 3 umul_ppmm calls, + since we know the difference should at most 16*(v1:v0) after the + subtraction below, thus at most 16*2^128. 
*/ + mpn_mul_n (uu, qq, MPFR_MANT(v), 2); + /* we now should have uu[3]:uu[2] <= r3:r2 */ + MPFR_ASSERTD(uu[3] < r3 || (uu[3] == r3 && uu[2] <= r2)); + /* subtract {uu,4} from r3:r2:0:0, with result in {uu,4} */ + uu[2] = r2 - uu[2]; + uu[3] = r3 - uu[3] - (uu[2] > r2); + /* now negate uu[1]:uu[0] */ + uu[0] = -uu[0]; + uu[1] = -uu[1] - (uu[0] != 0); + /* there is a borrow in uu[2] when uu[0] and uu[1] are not both zero */ + uu[3] -= (uu[1] != 0 || uu[0] != 0) && (uu[2] == 0); + uu[2] -= (uu[1] != 0 || uu[0] != 0); + MPFR_ASSERTD(uu[3] == MPFR_LIMB_ZERO); + while (uu[2] > 0 || (uu[1] > v1) || (uu[1] == v1 && uu[0] >= v0)) + { + /* add 1 to q1:q0 */ + q0 ++; + q1 += (q0 == 0); + /* subtract v1:v0 to u2:u1:u0 */ + uu[2] -= (uu[1] < v1) || (uu[1] == v1 && uu[0] < v0); + sub_ddmmss (uu[1], uu[0], uu[1], uu[0], v1, v0); + } + sb = uu[1] | uu[0]; } + goto round_div2; #endif /* now r3:r2 < v1:v0 */ @@ -334,6 +477,9 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) sb = r1 | r0; + /* here, q1:q0 should be an approximation of the quotient (or the exact + quotient), and sb the sticky bit */ + #if GMP_NUMB_BITS == 64 round_div2: #endif @@ -389,8 +535,9 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) } else if (rnd_mode == MPFR_RNDN) { - if (rb == 0 || (rb && sb == 0 && - (qp[0] & (MPFR_LIMB_ONE << sh)) == 0)) + /* See the comment in mpfr_div_1. */ + MPFR_ASSERTD(sb != 0); + if (rb == 0) goto truncate; else goto add_one_ulp; @@ -729,20 +876,22 @@ mpfr_div (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) } } - usize = MPFR_LIMB_SIZE(u); - vsize = MPFR_LIMB_SIZE(v); - /* When MPFR_GENERIC_ABI is defined, we don't use special code. */ #if !defined(MPFR_GENERIC_ABI) + if (MPFR_GET_PREC(u) == MPFR_GET_PREC(q) && + MPFR_GET_PREC(v) == MPFR_GET_PREC(q)) + { + if (MPFR_GET_PREC(q) < GMP_NUMB_BITS) + return mpfr_div_1 (q, u, v, rnd_mode); - if (MPFR_GET_PREC(q) < GMP_NUMB_BITS && usize == 1 && vsize == 1) - return mpfr_div_1 (q, u, v, rnd_mode); - - if (GMP_NUMB_BITS < MPFR_GET_PREC(q) && MPFR_GET_PREC(q) < 2 * GMP_NUMB_BITS - && usize == 2 && vsize == 2) + if (GMP_NUMB_BITS < MPFR_GET_PREC(q) && + MPFR_GET_PREC(q) < 2 * GMP_NUMB_BITS) return mpfr_div_2 (q, u, v, rnd_mode); + } #endif /* !defined(MPFR_GENERIC_ABI) */ + usize = MPFR_LIMB_SIZE(u); + vsize = MPFR_LIMB_SIZE(v); q0size = MPFR_LIMB_SIZE(q); /* number of limbs of destination */ q0p = MPFR_MANT(q); up = MPFR_MANT(u); diff --git a/src/frac.c b/src/frac.c index 46d4cddd7..f6bdd1bdd 100644 --- a/src/frac.c +++ b/src/frac.c @@ -108,7 +108,7 @@ mpfr_frac (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode) t0 = tn - un; tp = MPFR_MANT(t); if (sh == 0) - MPN_COPY_DECR(tp + t0, up, un + 1); + mpn_copyd (tp + t0, up, un + 1); else /* warning: un may be 0 here */ tp[tn] = k | ((un) ? mpn_lshift (tp + t0, up, un, sh) : MPFR_LIMB_ZERO); if (t0 > 0) diff --git a/src/mpfr-gmp.h b/src/mpfr-gmp.h index af30f5f7c..cad80346c 100644 --- a/src/mpfr-gmp.h +++ b/src/mpfr-gmp.h @@ -126,10 +126,6 @@ void *alloca (size_t); /* MP_LIMB macros */ #define MPN_ZERO(dst, n) memset((dst), 0, (n)*MPFR_BYTES_PER_MP_LIMB) -/* TODO: add MPFR_ASSERTD assertions for MPN_COPY_DECR and MPN_COPY_INCR, - even though memmove works with any overlapping. Useful to detect bugs! 
*/ -#define MPN_COPY_DECR(dst,src,n) memmove((dst),(src),(n)*MPFR_BYTES_PER_MP_LIMB) -#define MPN_COPY_INCR(dst,src,n) memmove((dst),(src),(n)*MPFR_BYTES_PER_MP_LIMB) #define MPN_COPY(dst,src,n) \ do \ { \ @@ -454,12 +450,13 @@ typedef struct {mp_limb_t inv32;} mpfr_pi1_t; } while (0) #endif -/* mpn_copyd is a new exported function in GMP 5. - It existed in GMP 4 in the internal header, but still may not be - defined if HAVE_NATIVE_mpn_copyd is not defined */ +/* mpn_copyi and mpn_copyd are new exported functions in GMP 5. + Defining them to memmove works in overlap cases. */ #if !__MPFR_GMP(5,0,0) +# undef mpn_copyi +# define mpn_copyi memmove # undef mpn_copyd -# define mpn_copyd MPN_COPY +# define mpn_copyd memmove #endif /* The following macro is copied from GMP-6.1.1, file gmp-impl.h, diff --git a/src/mpfr-impl.h b/src/mpfr-impl.h index 5eef0c61c..88765d8a5 100644 --- a/src/mpfr-impl.h +++ b/src/mpfr-impl.h @@ -1443,6 +1443,9 @@ do { \ /* Size of an array, as a constant expression. */ #define numberof_const(x) (sizeof (x) / sizeof ((x)[0])) +/* Addition with carry (detected by GCC and other good compilers). */ +#define ADD_LIMB(u,v,c) ((u) += (v), (c) = (u) < (v)) + /****************************************************** ************ Save exponent/flags macros ************ diff --git a/src/rndna.c b/src/rndna.c index 839fa138b..11866eb3c 100644 --- a/src/rndna.c +++ b/src/rndna.c @@ -87,7 +87,7 @@ mpfr_round_nearest_away_begin (mpfr_t rop) ext = (mpfr_size_limb_extended_t *) (*__gmp_allocate_func) (MPFR_MALLOC_EXTENDED_SIZE(xsize)); - /* Save the context first */ + /* Save the context first. */ ext[ALLOC_SIZE].si = xsize; ext[OLD_MANTISSA].pi = MPFR_MANT(rop); ext[OLD_EXPONENT].ex = MPFR_EXP(rop); @@ -103,15 +103,20 @@ mpfr_round_nearest_away_begin (mpfr_t rop) MPFR_MANT(tmp) = (mp_limb_t*)(ext+MANTISSA); /* Set Mantissa ptr */ MPFR_SET_NAN(tmp); /* initializes to NaN */ - /* Copy rop into tmp now (rop may be used as input latter). */ + /* Note: This full initialization to NaN may be unnecessary because of + the mpfr_set just below, but this is cleaner in case internals would + change in the future (for instance, some fields of tmp could be read + before being set, and reading an uninitialized value is UB here). */ + + /* Copy rop into tmp now (rop may be used as input later). */ MPFR_DBGRES (inexact = mpfr_set(tmp, rop, MPFR_RNDN)); MPFR_ASSERTD(inexact == 0); /* Shall be exact since prec(tmp) > prec(rop) */ - /* Overwrite rop with tmp */ + /* Overwrite rop with tmp. */ rop[0] = tmp[0]; - /* rop now contains a copy of rop, with one extra bit of precision - while keeping the old rop "hidden" within its mantissa. */ + /* The new rop now contains a copy of the old rop, with one extra bit of + precision while keeping the old rop "hidden" within its mantissa. */ return; } diff --git a/src/round_raw_generic.c b/src/round_raw_generic.c index 8501b794a..66dc9ed4f 100644 --- a/src/round_raw_generic.c +++ b/src/round_raw_generic.c @@ -103,7 +103,7 @@ mpfr_round_raw_generic( if (MPFR_UNLIKELY(xprec <= yprec)) { /* No rounding is necessary. 
*/ - /* if yp=xp, maybe an overlap: MPN_COPY_DECR is OK when src <= dst */ + /* if yp=xp, maybe an overlap: mpn_copyd is OK when src <= dst */ if (MPFR_LIKELY(rw)) nw++; MPFR_ASSERTD(nw >= 1); @@ -111,7 +111,7 @@ mpfr_round_raw_generic( if (use_inexp) *inexp = 0; #if flag == 0 - MPN_COPY_DECR(yp + (nw - xsize), xp, xsize); + mpn_copyd (yp + (nw - xsize), xp, xsize); MPN_ZERO(yp, nw - xsize); #endif return 0; @@ -159,7 +159,7 @@ mpfr_round_raw_generic( /* ((neg!=0)^(sb!=0)) ? MPFR_EVEN_INEX : -MPFR_EVEN_INEX */ /* since neg = 0 or 1 and sb = 0 */ #if flag == 0 - MPN_COPY_INCR(yp, xp + xsize - nw, nw); + mpn_copyi (yp, xp + xsize - nw, nw); yp[0] &= himask; #endif return 0; /* sb != 0 && rnd_mode != MPFR_RNDZ */ @@ -204,7 +204,7 @@ mpfr_round_raw_generic( /* ((neg != 0) ^ (rnd_mode != MPFR_RNDZ)) ? 1 : -1 */ *inexp = MPFR_UNLIKELY(sb == 0) ? 0 : (2*neg-1); #if flag == 0 - MPN_COPY_INCR(yp, xp + xsize - nw, nw); + mpn_copyi (yp, xp + xsize - nw, nw); yp[0] &= himask; #endif return 0; /* sb != 0 && rnd_mode != MPFR_RNDZ */ @@ -221,7 +221,7 @@ mpfr_round_raw_generic( /* ((neg != 0) ^ (rnd_mode != MPFR_RNDZ)) ? 1 : -1 */ *inexp = 0; #if flag == 0 - MPN_COPY_INCR(yp, xp + xsize - nw, nw); + mpn_copyi (yp, xp + xsize - nw, nw); yp[0] &= himask; #endif return 0; @@ -254,7 +254,7 @@ mpfr_round_raw_generic( } else himask = MPFR_LIMB_MAX; - MPN_COPY_INCR(yp, xp + xsize - nw, nw); + mpn_copyi (yp, xp + xsize - nw, nw); yp[0] &= himask; #endif return 0; diff --git a/src/set_uj.c b/src/set_uj.c index 50d77114a..fafc101c2 100644 --- a/src/set_uj.c +++ b/src/set_uj.c @@ -95,7 +95,7 @@ mpfr_set_uj_2exp (mpfr_t x, uintmax_t j, intmax_t e, mpfr_rnd_t rnd) if (MPFR_LIKELY (cnt != 0)) mpn_lshift (yp+len, yp, k, cnt); /* Normalize the high limb */ else if (len != 0) - MPN_COPY_DECR (yp+len, yp, k); /* Must use DECR */ + mpn_copyd (yp+len, yp, k); /* Must use copyd */ if (len != 0) { if (len == 1) @@ -119,28 +119,50 @@ mpfr_sqr_1 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) /* Special code for GMP_NUMB_BITS < prec(a) < 2*GMP_NUMB_BITS and GMP_NUMB_BITS < prec(b) <= 2*GMP_NUMB_BITS. - Note: this function was copied from mpfr_mul_2 in file mul.c, thus any change - here should be done also in mpfr_mul_2. */ + Note: this function was copied and optimized from mpfr_mul_2 in file mul.c, + thus any change here should be done also in mpfr_mul_2, if applicable. 
*/ static int mpfr_sqr_2 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) { mp_limb_t h, l, u, v; mpfr_limb_ptr ap = MPFR_MANT(a); - mpfr_exp_t ax = MPFR_GET_EXP(b) * 2; + mpfr_exp_t ax = 2 * MPFR_GET_EXP(b); mpfr_prec_t sh = 2 * GMP_NUMB_BITS - p; mp_limb_t rb, sb, sb2, mask = MPFR_LIMB_MASK(sh); mp_limb_t *bp = MPFR_MANT(b); /* we store the 4-limb product in h=ap[1], l=ap[0], sb=ap[-1], sb2=ap[-2] */ umul_ppmm (h, l, bp[1], bp[1]); - umul_ppmm (sb, sb2, bp[0], bp[0]); umul_ppmm (u, v, bp[1], bp[0]); - add_ssaaaa (l, sb, l, sb, u, v); - /* warning: (l < u) is incorrect to detect a carry out of add_ssaaaa, since we - might have u = 111...111, a carry coming from sb+v, thus l = u */ - h += (l < u) || (l == u && sb < v); - add_ssaaaa (l, sb, l, sb, u, v); - h += (l < u) || (l == u && sb < v); + l += u << 1; + h += (l < (u << 1)) + (u >> (GMP_NUMB_BITS - 1)); + + /* now the full square is {h, l, 2*v + high(b0*c0), low(b0*c0)}, + where the lower part contributes to less than 3 ulps to {h, l} */ + + /* If h has its most significant bit set and the low sh-1 bits of l are not + 000...000 nor 111...111 nor 111...110, then we can round correctly; + if h has zero as most significant bit, we have to shift left h and l, + thus if the low sh-2 bits are not 000...000 nor 111...111 nor 111...110, + then we can round correctly. To avoid an extra test we consider the latter + case (if we can round, we can also round in the former case). + For sh <= 3, we have mask <= 7, thus (mask>>2) <= 1, and the approximation + cannot be enough. */ + if (MPFR_LIKELY(((l + 2) & (mask >> 2)) > 2)) + sb = sb2 = 1; /* result cannot be exact in that case */ + else + { + mp_limb_t carry1, carry2; + + umul_ppmm (sb, sb2, bp[0], bp[0]); + /* the full product is {h, l, sb + v + w, sb2} */ + ADD_LIMB (sb, v, carry1); + ADD_LIMB (l, carry1, carry2); + h += carry2; + ADD_LIMB (sb, v, carry1); + ADD_LIMB (l, carry1, carry2); + h += carry2; + } if (h < MPFR_LIMB_HIGHBIT) { ax --; diff --git a/src/sqrt.c b/src/sqrt.c index ee3175fc1..853cb360c 100644 --- a/src/sqrt.c +++ b/src/sqrt.c @@ -85,19 +85,33 @@ mpfr_sqrt1_approx (mp_limb_t n) } \ while (0) -/* Put in rp[1]*2^64+rp[0] an approximation of sqrt(2^128*n), +/* Put in rp[1]*2^64+rp[0] an approximation of floor(sqrt(2^128*n)), with 2^126 <= n := np[1]*2^64 + np[0] < 2^128. - The error on {rp, 2} is less than 43 ulps (in unknown direction). + The error on {rp, 2} is at most 27 ulps, with {rp, 2} <= floor(sqrt(2^128*n)). */ static void mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np) { - int i = np[1] >> 56; + int i; mp_limb_t x, r1, r0, h, l, t; const mp_limb_t *u; const mp_limb_t magic = 0xda9fbe76c8b43800; /* ceil(0.854*2^64) */ - x = np[1] << 8 | (np[0] >> 56); + /* We round x upward to ensure {np,2} >= r1^2 below. Since we consider also the upper 8 + bits of np[0] into x, we have to deal separately with the case np[1] = 2^64-1 and the + upper 8 bits of np[0] are all 1. 
*/ + l = np[0] + MPFR_LIMB_MASK(56); + h = np[1] + (l < np[0]); + i = h >> 56; + x = (h << 8) | (l >> 56); + if (MPFR_UNLIKELY(i == 0)) + /* this happens when np[1] = 2^64-1, the upper 8 bits of np[0] are all 1, and the lower + 56 bits of np[0] are not all zero */ + { + MPFR_ASSERTD(x == MPFR_LIMB_ZERO); + goto compute_r1; + } + MPFR_ASSERTD(64 <= i && i < 256); u = V[i - 64]; umul_ppmm (h, l, u[8], x); /* the truncation error on h is at most 1 here */ @@ -116,12 +130,14 @@ mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np) truncation error on h+l/2^64 is <= 6/2^6 */ sub_ddmmss (h, l, u[0], u[1], h, l); /* Since the mathematical error is < 0.412e-19*2^64, the total error on - h + l/2^64 is less than 0.854; magic = ceil(0.854*2^64). */ + h + l/2^64 is less than 0.854; magic = ceil(0.854*2^64). We subtract it, + while keeping x >= 0. */ x = h - (l < magic && h != 0); - /* now 2^64 + x is an approximation of 2^96/sqrt(np[1]), - with 2^64 + x < 2^96/sqrt(np[1]) */ - + /* now 2^64 + x is an approximation of 2^96/sqrt(np[1]+1), + with 2^64 + x <= 2^96/sqrt(np[1]+1) */ + + compute_r1: umul_ppmm (r1, l, np[1], x); r1 += np[1]; @@ -132,11 +148,13 @@ mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np) r1 = MPFR_LIMB_HIGHBIT; umul_ppmm (h, l, r1, r1); + MPFR_ASSERTD(h < np[1] || (h == np[1] && l <= np[0])); sub_ddmmss (h, l, np[1], np[0], h, l); + /* divide by 2 */ l = (h << 63) | (l >> 1); h = h >> 1; - /* now add (2^64+x) * (h*2^64+l) / 2^64 to r0 */ + /* now add (2^64+x) * (h*2^64+l) / 2^64 to [r1*2^64, 0] */ umul_ppmm (r0, t, x, l); /* x * l */ r0 += l; @@ -147,16 +165,10 @@ mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np) r1 += (r0 < x); /* add x */ } - if (r1 & MPFR_LIMB_HIGHBIT) - { - rp[0] = r0; - rp[1] = r1; - } - else /* overflow occurred in r1 */ - { - rp[0] = ~MPFR_LIMB_ZERO; - rp[1] = ~MPFR_LIMB_ZERO; - } + MPFR_ASSERTD(r1 & MPFR_LIMB_HIGHBIT); + + rp[0] = r0; + rp[1] = r1; } /* Special code for prec(r), prec(u) < GMP_NUMB_BITS. We cannot have @@ -312,8 +324,10 @@ mpfr_sqrt2 (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode) mask = MPFR_LIMB_MASK(sh); mpfr_sqrt2_approx (rp, np + 2); - /* the error is less than 43 ulps on rp[0] */ - if (((rp[0] + 42) & (mask >> 1)) > 84) + /* the error is at most 27 ulps on rp[0], with {rp, 2} smaller or equal + to the exact square root, thus we can round correctly except when the + number formed by the last sh-1 bits of rp[0] is 0, -1, -2, ..., -27. 
*/ + if (((rp[0] + 27) & (mask >> 1)) > 27) sb = 1; else { diff --git a/src/strtofr.c b/src/strtofr.c index efd5ef5ff..b1c62cda3 100644 --- a/src/strtofr.c +++ b/src/strtofr.c @@ -532,7 +532,7 @@ parsed_string_to_mpfr (mpfr_t x, struct parsed_string *pstr, mpfr_rnd_t rnd) if (real_ysize != ysize) { if (count == 0) - MPN_COPY_DECR (y + ysize - real_ysize, y, real_ysize); + mpn_copyd (y + ysize - real_ysize, y, real_ysize); MPN_ZERO (y, ysize - real_ysize); } /* for each bit shift decrease exponent of y */ diff --git a/src/sub1sp.c b/src/sub1sp.c index f9eee75dd..b469b25a5 100644 --- a/src/sub1sp.c +++ b/src/sub1sp.c @@ -620,6 +620,8 @@ mpfr_sub1sp3 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, if (d < GMP_NUMB_BITS) { mp_limb_t cy; + /* warning: we must have the most significant bit of sb correct + since it might become the round bit below */ sb = cp[0] << (GMP_NUMB_BITS - d); /* neglected part of c */ a0 = bp[0] - ((cp[1] << (GMP_NUMB_BITS - d)) | (cp[0] >> d)); a1 = bp[1] - ((cp[2] << (GMP_NUMB_BITS - d)) | (cp[1] >> d)) @@ -681,6 +683,8 @@ mpfr_sub1sp3 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, else if (d < 2 * GMP_NUMB_BITS) { mp_limb_t c0shifted; + /* warning: we must have the most significant bit of sb correct + since it might become the round bit below */ sb = (d == GMP_NUMB_BITS) ? cp[0] : (cp[1] << (2*GMP_NUMB_BITS - d)) | (cp[0] != 0); c0shifted = (d == GMP_NUMB_BITS) ? cp[1] @@ -980,8 +984,8 @@ mpfr_sub1sp (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode) mpn_lshift (ap + len, ap, k, cnt); /* Normalize the High Limb*/ else { - /* Must use DECR since src and dest may overlap & dest>=src*/ - MPN_COPY_DECR(ap+len, ap, k); + /* Must use copyd since src and dst may overlap & dst>=src */ + mpn_copyd (ap+len, ap, k); } MPN_ZERO(ap, len); /* Zeroing the last limbs */ bx -= cnt + len*GMP_NUMB_BITS; /* Update Expo */ @@ -473,7 +473,7 @@ sum_raw (mp_limb_t *wp, mp_size_t ws, mpfr_prec_t wq, mpfr_ptr *const x, if (MPFR_LIKELY (shiftc != 0)) mpn_lshift (wp + shifts, wp, ws - shifts, shiftc); else - MPN_COPY_DECR (wp + shifts, wp, ws - shifts); + mpn_copyd (wp + shifts, wp, ws - shifts); MPN_ZERO (wp, shifts); /* Compute minexp = minexp - shiftq safely. */ SAFE_SUB (minexp, minexp, shiftq); |
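
The patch adds an ADD_LIMB macro to mpfr-impl.h ("Addition with carry") and uses it in the new div.c and sqr.c code to propagate carries with plain C that good compilers turn into add-with-carry instructions. The short program below is only an illustrative sketch, not part of the patch: it chains the same two-statement idiom to add two 2-limb numbers and checks the result against a 128-bit reference. The limb_t typedef, the test values and the __int128 check are assumptions made for this demo.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t limb_t;

/* Same idea as the ADD_LIMB macro from the patch: u += v, and c receives
   the carry out (1 if the addition wrapped around, 0 otherwise). */
#define ADD_LIMB(u, v, c) ((u) += (v), (c) = (u) < (v))

int main (void)
{
  /* a = a1*2^64 + a0 and b = b1*2^64 + b0 */
  limb_t a0 = 0xfffffffffffffff0u, a1 = 1;
  limb_t b0 = 0x20u,               b1 = 2;
  limb_t s0, s1, c0, c1, c2;

  /* low limb: a0 + b0, carry out in c0 */
  s0 = a0;
  ADD_LIMB (s0, b0, c0);

  /* high limb: a1 + b1, then add the carry with a second ADD_LIMB */
  s1 = a1;
  ADD_LIMB (s1, b1, c1);
  ADD_LIMB (s1, c0, c2);

  /* reference check using the (non-portable) __int128 extension */
  unsigned __int128 ref = (((unsigned __int128) a1 << 64) | a0)
                        + (((unsigned __int128) b1 << 64) | b0);
  assert (s0 == (limb_t) ref && s1 == (limb_t) (ref >> 64));
  assert (c1 == 0 && c2 == 0); /* no overflow out of 128 bits here */

  printf ("sum = 0x%016" PRIx64 "%016" PRIx64 "\n", s1, s0);
  return 0;
}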
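
Both mpfr_div_1 and mpfr_div_2 now assert sb != 0 in the MPFR_RNDN branch, on the grounds stated in the new comment: rb != 0 with sb = 0 would mean that a p-bit by p-bit division yields an exact (p+1)-bit quotient, so the exact quotient can never fall halfway between two representable numbers. A brute-force confirmation of this property at a toy precision is sketched below; the precision P and the scaling used to expose the round bit and sticky bit are choices made for this illustration, not MPFR code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy precision; in the patch the special code handles precisions below
   GMP_NUMB_BITS, but the no-halfway-case property is generic. */
#define P 8

int main (void)
{
  uint64_t lo = UINT64_C(1) << (P - 1), hi = UINT64_C(1) << P;

  for (uint64_t u = lo; u < hi; u++)
    for (uint64_t v = lo; v < hi; v++)
      {
        /* Scale u so that the truncated quotient q has exactly P+1 bits:
           its low bit is the round bit, the remainder feeds the sticky bit. */
        int shift = (u >= v) ? P : P + 1;
        uint64_t num = u << shift;
        uint64_t q = num / v, r = num % v;
        assert (q >= hi && q < 2 * hi); /* q has exactly P+1 bits */
        /* rb = 1 with sb = 0 would be a halfway case: it never happens. */
        assert (!((q & 1) == 1 && r == 0));
      }

  printf ("no halfway case found for p = %d\n", P);
  return 0;
}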