author     vlefevre <vlefevre@280ebfd0-de03-0410-8827-d642c229c3f4>  2017-05-04 07:46:57 +0000
committer  vlefevre <vlefevre@280ebfd0-de03-0410-8827-d642c229c3f4>  2017-05-04 07:46:57 +0000
commit     6de18e36a603b2b4c3f065119de5d5fd78367b51 (patch)
tree       333e193c333ea5369d4fcd17e5fbb469dd50c40d /src
parent     c64ffc2f9958b4fa2eab676fee377ef0fbcaddba (diff)
download   mpfr-6de18e36a603b2b4c3f065119de5d5fd78367b51.tar.gz
Merged r11179-11196 from the trunk (no conflicts).
git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/branches/faithful@11453 280ebfd0-de03-0410-8827-d642c229c3f4
Diffstat (limited to 'src')
-rw-r--r--  src/add1sp.c             22
-rw-r--r--  src/div.c               205
-rw-r--r--  src/frac.c                2
-rw-r--r--  src/mpfr-gmp.h           13
-rw-r--r--  src/mpfr-impl.h           3
-rw-r--r--  src/rndna.c              15
-rw-r--r--  src/round_raw_generic.c  12
-rw-r--r--  src/set_uj.c              2
-rw-r--r--  src/sqr.c                42
-rw-r--r--  src/sqrt.c               56
-rw-r--r--  src/strtofr.c             2
-rw-r--r--  src/sub1sp.c              8
-rw-r--r--  src/sum.c                 2
13 files changed, 291 insertions, 93 deletions
diff --git a/src/add1sp.c b/src/add1sp.c
index 03a8ba929..4182cfcf1 100644
--- a/src/add1sp.c
+++ b/src/add1sp.c
@@ -123,6 +123,8 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode,
if (bx == cx)
{
+ /* TODO: a0 = (bp[0] >> 1) + (cp[0] >> 1) is probably better
+ (no long constant to load in a register). */
/* since bp[0], cp[0] >= MPFR_LIMB_HIGHBIT, a carry always occurs */
a0 = MPFR_LIMB_HIGHBIT | ((bp[0] + cp[0]) >> 1);
bx ++;
@@ -135,14 +137,17 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode,
BGreater1:
d = (mpfr_uexp_t) bx - cx;
mask = MPFR_LIMB_MASK(sh);
+ /* TODO: Should the case d < sh be removed, i.e. seen as a particular
+ case of d < GMP_NUMB_BITS? This case would do a few more operations
+ but a test would be removed, avoiding pipeline stall issues. */
if (d < sh)
{
/* we can shift c by d bits to the right without losing any bit,
moreover we can shift one more if there is an exponent increase */
- rb = bp[0];
- a0 = rb + (cp[0] >> d);
- if (a0 < rb) /* carry */
+ a0 = bp[0] + (cp[0] >> d);
+ if (a0 < bp[0]) /* carry */
{
+ MPFR_ASSERTD ((a0 & 1) == 0);
a0 = MPFR_LIMB_HIGHBIT | (a0 >> 1);
bx ++;
}
@@ -152,10 +157,9 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode,
}
else if (d < GMP_NUMB_BITS) /* sh <= d < GMP_NUMB_BITS */
{
- rb = bp[0];
sb = cp[0] << (GMP_NUMB_BITS - d); /* bits from cp[-1] after shift */
- a0 = rb + (cp[0] >> d);
- if (a0 < rb)
+ a0 = bp[0] + (cp[0] >> d);
+ if (a0 < bp[0]) /* carry */
{
sb |= a0 & 1;
a0 = MPFR_LIMB_HIGHBIT | (a0 >> 1);
@@ -181,9 +185,9 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode,
goto BGreater1;
}
- /* Note: we could keep the output significand in a0 for the rounding, and only
- store it in ap[0] at the very end, but this seems slower on average (but better
- for the worst case). */
+ /* Note: we could keep the output significand in a0 for the rounding,
+ and only store it in ap[0] at the very end, but this seems slower
+ on average (but better for the worst case). */
/* now perform rounding */
if (MPFR_UNLIKELY(bx > __gmpfr_emax))
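The carry handling in the add1sp.c hunks above relies on unsigned wraparound: after a0 = bp[0] + (cp[0] >> d), a carry out of the limb occurred exactly when the wrapped sum is smaller than bp[0], and the result is then renormalized by a one-bit right shift with the high bit restored. A minimal standalone sketch of that idiom, with plain uint64_t limbs and made-up values (not MPFR code):

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  uint64_t b  = UINT64_C(0x8000000000000000);   /* both significands are    */
  uint64_t c  = UINT64_C(0xc000000000000000);   /* normalized: high bit set */
  int bx = 0;                                   /* exponent of the result   */

  uint64_t a0 = b + c;          /* wraps modulo 2^64 */
  if (a0 < b)                   /* carry out of the addition */
    {
      /* renormalize: shift right by one and restore the high bit,
         bumping the exponent, as mpfr_add1sp1 does */
      a0 = UINT64_C(0x8000000000000000) | (a0 >> 1);
      bx++;
    }
  printf ("a0 = 0x%llx, bx = %d\n", (unsigned long long) a0, bx);
  return 0;
}

Compilers typically turn the "sum smaller than an operand" test into a direct use of the carry flag.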
diff --git a/src/div.c b/src/div.c
index 138f6bb32..d628b9e1b 100644
--- a/src/div.c
+++ b/src/div.c
@@ -37,6 +37,7 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
#include "invert_limb.h"
+#if 1
/* Given u = u1*B+u0 < d = d1*B+d0 with d normalized (high bit of d1 set),
put in v = v1*B+v0 an approximation of floor(u*B^2/d), with:
B = 2^GMP_NUMB_BITS and v <= floor(u*B^2/d) <= v + 16.
@@ -78,11 +79,87 @@ mpfr_div2_approx (mpfr_limb_ptr v1, mpfr_limb_ptr v0,
*v1 += 1 + (*v0 < x);
}
}
+#else
+/* the following is some experimental code, not fully proven correct,
+ and which does not seem to be faster, unless we find a way to speed up
+ the while loop (if 'while' is replaced by 'if' it becomes faster than
+ the above code, but is then wrong) */
+static void
+mpfr_div2_approx (mpfr_limb_ptr Q1, mpfr_limb_ptr Q0,
+ mp_limb_t u1, mp_limb_t u0,
+ mp_limb_t v1, mp_limb_t v0)
+{
+ mp_limb_t inv, q1, q0, r2, r1, r0, cy;
+
+ /* first compute an approximation of q1 */
+ if (MPFR_UNLIKELY(v1 == MPFR_LIMB_MAX))
+ inv = MPFR_LIMB_ZERO;
+ else
+ __gmpfr_invert_limb_approx (inv, v1 + 1);
+ /* now inv <= B^2/(v1+1) - B */
+ umul_ppmm (q1, q0, u1, inv);
+ q1 += u1;
+
+ /* we have q1 <= floor(u1*2^GMP_NUMB_BITS/v1) <= q1 + 2 */
+
+ umul_ppmm (r2, r1, q1, v1);
+ umul_ppmm (cy, r0, q1, v0);
+ r1 += cy;
+ r2 += (r1 < cy);
+
+ /* Now r2:r1:r0 = q1 * v1:v0. While q1*v1 <= u1:0, it might be that
+ r2:r1 > u1:u0. First subtract r2:r1:r0 from u1:u0:0, with result
+ in r2:r1:r0. */
+ r0 = -r0;
+ r1 = u0 - r1 - (r0 != 0);
+ r2 = u1 - r2 - (r1 > u0 || (r1 == u0 && r0 != 0));
+
+ /* u1:u0:0 - q1 * v1:v0 >= u1*B^2 - q1*(v1*B+B-1) >= -q1*B > -B^2
+ thus r2 can be 0 or -1 here. */
+ if (r2 & MPFR_LIMB_HIGHBIT) /* correction performed at most two times */
+ {
+ ADD_LIMB(r0, v0, cy); /* r0 += v0; cy = r0 < v0 */
+ ADD_LIMB(r1, v1, cy);
+ r2 += cy;
+ q1 --;
+ if (r2 & MPFR_LIMB_HIGHBIT)
+ {
+ ADD_LIMB(r0, v0, cy); /* r0 += v0; cy = r0 < v0 */
+ ADD_LIMB(r1, v1, cy);
+ r2 += cy;
+ q1 --;
+ }
+ }
+ else
+ {
+ /* experimentally, the number of loops is <= 5 */
+ while (r2 || r1 > v1 || (r1 == v1 && r0 >= v0))
+ {
+ r2 -= (r1 < v1 || (r1 == v1 && r0 < v0));
+ r1 -= v1 + (r0 < v0);
+ r0 -= v0;
+ q1 ++;
+ }
+ }
+
+ /* now u1:u0:0 = q1 * d1:d0 + r1:r0, with 0 <= r1:r0 < d1:d0 */
+ MPFR_ASSERTD(r2 == MPFR_LIMB_ZERO);
+ MPFR_ASSERTD(r1 < v1 || (r1 == v1 && r0 < v0));
+
+ /* the second quotient limb is approximated by r1*B / d1,
+ which is itself approximated by r1+(r1*inv/B) */
+ umul_ppmm (q0, r0, r1, inv);
+
+ *Q1 = q1;
+ *Q0 = r1 + q0;
+
+ /* experimentally: q1:q0 <= floor(B^2*u/v) <= q1:q0 + 5 */
+}
+#endif
#endif /* GMP_NUMB_BITS == 64 */
-/* Special code for p=PREC(q) < GMP_NUMB_BITS,
- and PREC(u), PREC(v) <= GMP_NUMB_BITS */
+/* Special code for PREC(q) = PREC(u) = PREC(v) = p < GMP_NUMB_BITS */
static int
mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
{
@@ -99,31 +176,60 @@ mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
u0 -= v0;
#if GMP_NUMB_BITS == 64 /* __gmpfr_invert_limb_approx only exists for 64-bit */
- /* first try with an approximate quotient */
+ /* First try with an approximate quotient.
+ FIXME: for p>=62 we have sh-1<2 and will never be able to round correctly.
+ Even for p=61 we have sh-1=2 and we can round correctly only when the
+ last two bits of q0 are 01, which happens with probability 25% only. */
{
mp_limb_t inv;
__gmpfr_invert_limb_approx (inv, v0);
- umul_ppmm (q0, sb, u0, inv);
+ umul_ppmm (rb, sb, u0, inv);
}
- q0 = (q0 + u0) >> extra;
- /* before the >> extra shift, q0 + u0 does not exceed the true quotient
- floor(u'0*2^GMP_NUMB_BITS/v0), with error at most 2, which means the rational
- quotient q satisfies q0 + u0 <= q < q0 + u0 + 3. We can round correctly except
- when the last sh-1 bits of q0 are 000..000 or 111..111 or 111..110. */
+ rb += u0;
+ q0 = rb >> extra;
+ /* rb does not exceed the true quotient floor(u0*2^GMP_NUMB_BITS/v0),
+ with error at most 2, which means the rational quotient q satisfies
+ rb <= q < rb + 3. We can round correctly except when the last sh-1 bits
+ of q0 are 000..000 or 111..111 or 111..110. */
if (MPFR_LIKELY(((q0 + 2) & (mask >> 1)) > 2))
{
rb = q0 & (MPFR_LIMB_ONE << (sh - 1));
- sb = 1; /* result cannot be exact when we can round with an approximation */
+ sb = 1; /* result cannot be exact in this case */
}
- else /* compute exact quotient and remainder */
-#endif
+ else /* the true quotient is rb, rb+1 or rb+2 */
{
- udiv_qrnnd (q0, sb, u0, 0, v0);
- sb |= q0 & extra;
+ mp_limb_t h, l;
+ q0 = rb;
+ umul_ppmm (h, l, q0, v0);
+ MPFR_ASSERTD(h < u0 || (h == u0 && l == MPFR_LIMB_ZERO));
+ /* subtract {h,l} from {u0,0} */
+ sub_ddmmss (h, l, u0, 0, h, l);
+ /* the remainder {h, l} should be < v0 */
+ if (h || l >= v0)
+ {
+ q0 ++;
+ h -= (l < v0);
+ l -= v0;
+ }
+ if (h || l >= v0)
+ {
+ q0 ++;
+ h -= (l < v0);
+ l -= v0;
+ }
+ MPFR_ASSERTD(h == 0 && l < v0);
+ sb = l | (q0 & extra);
q0 >>= extra;
rb = q0 & (MPFR_LIMB_ONE << (sh - 1));
sb |= q0 & (mask >> 1);
}
+#else
+ udiv_qrnnd (q0, sb, u0, 0, v0);
+ sb |= q0 & extra;
+ q0 >>= extra;
+ rb = q0 & (MPFR_LIMB_ONE << (sh - 1));
+ sb |= q0 & (mask >> 1);
+#endif
qp[0] = (MPFR_LIMB_HIGHBIT | q0) & ~mask;
qx += extra;
@@ -165,7 +271,12 @@ mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
}
else if (rnd_mode == MPFR_RNDN)
{
- if (rb == 0 || (sb == 0 && (qp[0] & (MPFR_LIMB_ONE << sh)) == 0))
+ /* It is not possible to have rb <> 0 and sb = 0 here, since it would
+ mean an n-bit by n-bit division gives an exact (n+1)-bit number.
+ And since the case rb = sb = 0 was already dealt with, we cannot
+ have sb = 0. Thus we cannot be in the middle of two numbers. */
+ MPFR_ASSERTD(sb != 0);
+ if (rb == 0)
goto truncate;
else
goto add_one_ulp;
@@ -194,7 +305,7 @@ mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
}
/* Special code for GMP_NUMB_BITS < PREC(q) < 2*GMP_NUMB_BITS and
- GMP_NUMB_BITS < PREC(u), PREC(v) <= 2*GMP_NUMB_BITS */
+ PREC(u) = PREC(v) = PREC(q) */
static int
mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
{
@@ -220,10 +331,42 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
/* we know q1*B+q0 is smaller or equal to the exact quotient, with
difference at most 16 */
if (MPFR_LIKELY(((q0 + 16) & (mask >> 1)) > 16))
+ sb = 1; /* result is not exact when we can round with an approximation */
+ else
{
- sb = 1; /* result is not exact when we can round with an approximation */
- goto round_div2;
+ /* we know q1:q0 is a good-enough approximation, use it! */
+ mp_limb_t qq[2], uu[4];
+
+ qq[0] = q0;
+ qq[1] = q1;
+ /* FIXME: instead of using mpn_mul_n, we can use 3 umul_ppmm calls,
+ since we know the difference should be at most 16*(v1:v0) after the
+ subtraction below, thus at most 16*2^128. */
+ mpn_mul_n (uu, qq, MPFR_MANT(v), 2);
+ /* we now should have uu[3]:uu[2] <= r3:r2 */
+ MPFR_ASSERTD(uu[3] < r3 || (uu[3] == r3 && uu[2] <= r2));
+ /* subtract {uu,4} from r3:r2:0:0, with result in {uu,4} */
+ uu[2] = r2 - uu[2];
+ uu[3] = r3 - uu[3] - (uu[2] > r2);
+ /* now negate uu[1]:uu[0] */
+ uu[0] = -uu[0];
+ uu[1] = -uu[1] - (uu[0] != 0);
+ /* there is a borrow in uu[2] when uu[0] and uu[1] are not both zero */
+ uu[3] -= (uu[1] != 0 || uu[0] != 0) && (uu[2] == 0);
+ uu[2] -= (uu[1] != 0 || uu[0] != 0);
+ MPFR_ASSERTD(uu[3] == MPFR_LIMB_ZERO);
+ while (uu[2] > 0 || (uu[1] > v1) || (uu[1] == v1 && uu[0] >= v0))
+ {
+ /* add 1 to q1:q0 */
+ q0 ++;
+ q1 += (q0 == 0);
+ /* subtract v1:v0 from u2:u1:u0 */
+ uu[2] -= (uu[1] < v1) || (uu[1] == v1 && uu[0] < v0);
+ sub_ddmmss (uu[1], uu[0], uu[1], uu[0], v1, v0);
+ }
+ sb = uu[1] | uu[0];
}
+ goto round_div2;
#endif
/* now r3:r2 < v1:v0 */
@@ -334,6 +477,9 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
sb = r1 | r0;
+ /* here, q1:q0 should be an approximation of the quotient (or the exact
+ quotient), and sb the sticky bit */
+
#if GMP_NUMB_BITS == 64
round_div2:
#endif
@@ -389,8 +535,9 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
}
else if (rnd_mode == MPFR_RNDN)
{
- if (rb == 0 || (rb && sb == 0 &&
- (qp[0] & (MPFR_LIMB_ONE << sh)) == 0))
+ /* See the comment in mpfr_div_1. */
+ MPFR_ASSERTD(sb != 0);
+ if (rb == 0)
goto truncate;
else
goto add_one_ulp;
@@ -729,20 +876,22 @@ mpfr_div (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
}
}
- usize = MPFR_LIMB_SIZE(u);
- vsize = MPFR_LIMB_SIZE(v);
-
/* When MPFR_GENERIC_ABI is defined, we don't use special code. */
#if !defined(MPFR_GENERIC_ABI)
+ if (MPFR_GET_PREC(u) == MPFR_GET_PREC(q) &&
+ MPFR_GET_PREC(v) == MPFR_GET_PREC(q))
+ {
+ if (MPFR_GET_PREC(q) < GMP_NUMB_BITS)
+ return mpfr_div_1 (q, u, v, rnd_mode);
- if (MPFR_GET_PREC(q) < GMP_NUMB_BITS && usize == 1 && vsize == 1)
- return mpfr_div_1 (q, u, v, rnd_mode);
-
- if (GMP_NUMB_BITS < MPFR_GET_PREC(q) && MPFR_GET_PREC(q) < 2 * GMP_NUMB_BITS
- && usize == 2 && vsize == 2)
+ if (GMP_NUMB_BITS < MPFR_GET_PREC(q) &&
+ MPFR_GET_PREC(q) < 2 * GMP_NUMB_BITS)
return mpfr_div_2 (q, u, v, rnd_mode);
+ }
#endif /* !defined(MPFR_GENERIC_ABI) */
+ usize = MPFR_LIMB_SIZE(u);
+ vsize = MPFR_LIMB_SIZE(v);
q0size = MPFR_LIMB_SIZE(q); /* number of limbs of destination */
q0p = MPFR_MANT(q);
up = MPFR_MANT(u);
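The exact-quotient branch added to mpfr_div_1 above follows a classic pattern: an approximation rb known to satisfy rb <= floor(u0*2^64/v0) <= rb + 2 is turned into the exact quotient and remainder by at most two conditional subtractions of the divisor. The sketch below shows just that correction step; it uses unsigned __int128 for the 128-bit numerator (an assumption about the compiler), where the MPFR code instead works limb by limb with umul_ppmm and sub_ddmmss.

#include <stdint.h>
#include <assert.h>

typedef unsigned __int128 u128;   /* assumes a compiler providing __int128 */

/* Given q with q <= floor(U/v) <= q + 2, return the exact quotient. */
static uint64_t
correct_quotient (u128 U, uint64_t v, uint64_t q)
{
  u128 r = U - (u128) q * v;      /* remainder of the approximation */
  if (r >= v) { q++; r -= v; }    /* at most two corrections are needed */
  if (r >= v) { q++; r -= v; }
  assert (r < v);                 /* now q = floor(U/v) and r = U mod v */
  return q;
}

int main (void)
{
  u128 U = ((u128) 123456789 << 64) | 42;   /* made-up 128-bit numerator */
  uint64_t v = UINT64_C(0x9000000000000001);
  uint64_t exact = (uint64_t) (U / v);
  assert (correct_quotient (U, v, exact - 2) == exact);  /* worst case */
  assert (correct_quotient (U, v, exact) == exact);      /* already exact */
  return 0;
}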
diff --git a/src/frac.c b/src/frac.c
index 46d4cddd7..f6bdd1bdd 100644
--- a/src/frac.c
+++ b/src/frac.c
@@ -108,7 +108,7 @@ mpfr_frac (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
t0 = tn - un;
tp = MPFR_MANT(t);
if (sh == 0)
- MPN_COPY_DECR(tp + t0, up, un + 1);
+ mpn_copyd (tp + t0, up, un + 1);
else /* warning: un may be 0 here */
tp[tn] = k | ((un) ? mpn_lshift (tp + t0, up, un, sh) : MPFR_LIMB_ZERO);
if (t0 > 0)
diff --git a/src/mpfr-gmp.h b/src/mpfr-gmp.h
index af30f5f7c..cad80346c 100644
--- a/src/mpfr-gmp.h
+++ b/src/mpfr-gmp.h
@@ -126,10 +126,6 @@ void *alloca (size_t);
/* MP_LIMB macros */
#define MPN_ZERO(dst, n) memset((dst), 0, (n)*MPFR_BYTES_PER_MP_LIMB)
-/* TODO: add MPFR_ASSERTD assertions for MPN_COPY_DECR and MPN_COPY_INCR,
- even though memmove works with any overlapping. Useful to detect bugs! */
-#define MPN_COPY_DECR(dst,src,n) memmove((dst),(src),(n)*MPFR_BYTES_PER_MP_LIMB)
-#define MPN_COPY_INCR(dst,src,n) memmove((dst),(src),(n)*MPFR_BYTES_PER_MP_LIMB)
#define MPN_COPY(dst,src,n) \
do \
{ \
@@ -454,12 +450,13 @@ typedef struct {mp_limb_t inv32;} mpfr_pi1_t;
} while (0)
#endif
-/* mpn_copyd is a new exported function in GMP 5.
- It existed in GMP 4 in the internal header, but still may not be
- defined if HAVE_NATIVE_mpn_copyd is not defined */
+/* mpn_copyi and mpn_copyd are new exported functions in GMP 5.
+ Defining them to memmove works in overlap cases. */
#if !__MPFR_GMP(5,0,0)
+# undef mpn_copyi
+# define mpn_copyi(dst,src,n) memmove((dst),(src),(n)*MPFR_BYTES_PER_MP_LIMB)
# undef mpn_copyd
-# define mpn_copyd MPN_COPY
+# define mpn_copyd(dst,src,n) memmove((dst),(src),(n)*MPFR_BYTES_PER_MP_LIMB)
#endif
/* The following macro is copied from GMP-6.1.1, file gmp-impl.h,
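For GMP versions before 5, the change above now maps mpn_copyi and mpn_copyd to memmove with the limb count converted to a byte count; memmove is safe here because it handles overlapping source and destination in either direction. A small standalone illustration of the typical "shift limbs upward" use that previously went through MPN_COPY_DECR, with plain uint64_t limbs (not MPFR code):

#include <string.h>
#include <stdio.h>
#include <stdint.h>

typedef uint64_t limb_t;                 /* stand-in for mp_limb_t */
#define BYTES_PER_LIMB sizeof (limb_t)

/* same shape as the macro above, with limb_t instead of mp_limb_t */
#define my_copyd(dst,src,n) memmove ((dst), (src), (n) * BYTES_PER_LIMB)

int main (void)
{
  limb_t a[6] = {1, 2, 3, 4, 5, 6};

  /* shift the first four limbs up by two: dst > src and the regions
     overlap, which memmove handles in either direction */
  my_copyd (a + 2, a, 4);
  memset (a, 0, 2 * BYTES_PER_LIMB);     /* like MPN_ZERO (a, 2) */

  for (int i = 0; i < 6; i++)
    printf ("%d ", (int) a[i]);          /* prints: 0 0 1 2 3 4 */
  printf ("\n");
  return 0;
}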
diff --git a/src/mpfr-impl.h b/src/mpfr-impl.h
index 5eef0c61c..88765d8a5 100644
--- a/src/mpfr-impl.h
+++ b/src/mpfr-impl.h
@@ -1443,6 +1443,9 @@ do { \
/* Size of an array, as a constant expression. */
#define numberof_const(x) (sizeof (x) / sizeof ((x)[0]))
+/* Addition with carry (detected by GCC and other good compilers). */
+#define ADD_LIMB(u,v,c) ((u) += (v), (c) = (u) < (v))
+
/******************************************************
************ Save exponent/flags macros ************
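The new ADD_LIMB macro adds v into u and records the carry-out in c, relying on the fact that for unsigned limbs the wrapped sum is smaller than v exactly when a carry occurred; good compilers recognize this as an add-with-carry. A minimal standalone demo of producing a carry and propagating it into the next limb, with plain uint64_t (not MPFR code):

#include <stdint.h>
#include <stdio.h>

#define ADD_LIMB(u,v,c) ((u) += (v), (c) = (u) < (v))

int main (void)
{
  uint64_t lo = UINT64_C(0xffffffffffffffff), hi = 7;
  uint64_t one = 1, carry1, carry2;

  ADD_LIMB (lo, one, carry1);      /* lo wraps to 0, carry1 = 1 */
  ADD_LIMB (hi, carry1, carry2);   /* propagate the carry into hi */

  printf ("hi = %llu, lo = %llu, carry out = %llu\n",   /* 8, 0, 0 */
          (unsigned long long) hi, (unsigned long long) lo,
          (unsigned long long) carry2);
  return 0;
}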
diff --git a/src/rndna.c b/src/rndna.c
index 839fa138b..11866eb3c 100644
--- a/src/rndna.c
+++ b/src/rndna.c
@@ -87,7 +87,7 @@ mpfr_round_nearest_away_begin (mpfr_t rop)
ext = (mpfr_size_limb_extended_t *)
(*__gmp_allocate_func) (MPFR_MALLOC_EXTENDED_SIZE(xsize));
- /* Save the context first */
+ /* Save the context first. */
ext[ALLOC_SIZE].si = xsize;
ext[OLD_MANTISSA].pi = MPFR_MANT(rop);
ext[OLD_EXPONENT].ex = MPFR_EXP(rop);
@@ -103,15 +103,20 @@ mpfr_round_nearest_away_begin (mpfr_t rop)
MPFR_MANT(tmp) = (mp_limb_t*)(ext+MANTISSA); /* Set Mantissa ptr */
MPFR_SET_NAN(tmp); /* initializes to NaN */
- /* Copy rop into tmp now (rop may be used as input latter). */
+ /* Note: This full initialization to NaN may be unnecessary because of
+ the mpfr_set just below, but this is cleaner in case internals would
+ change in the future (for instance, some fields of tmp could be read
+ before being set, and reading an uninitialized value is UB here). */
+
+ /* Copy rop into tmp now (rop may be used as input later). */
MPFR_DBGRES (inexact = mpfr_set(tmp, rop, MPFR_RNDN));
MPFR_ASSERTD(inexact == 0); /* Shall be exact since prec(tmp) > prec(rop) */
- /* Overwrite rop with tmp */
+ /* Overwrite rop with tmp. */
rop[0] = tmp[0];
- /* rop now contains a copy of rop, with one extra bit of precision
- while keeping the old rop "hidden" within its mantissa. */
+ /* The new rop now contains a copy of the old rop, with one extra bit of
+ precision while keeping the old rop "hidden" within its mantissa. */
return;
}
diff --git a/src/round_raw_generic.c b/src/round_raw_generic.c
index 8501b794a..66dc9ed4f 100644
--- a/src/round_raw_generic.c
+++ b/src/round_raw_generic.c
@@ -103,7 +103,7 @@ mpfr_round_raw_generic(
if (MPFR_UNLIKELY(xprec <= yprec))
{ /* No rounding is necessary. */
- /* if yp=xp, maybe an overlap: MPN_COPY_DECR is OK when src <= dst */
+ /* if yp=xp, maybe an overlap: mpn_copyd is OK when src <= dst */
if (MPFR_LIKELY(rw))
nw++;
MPFR_ASSERTD(nw >= 1);
@@ -111,7 +111,7 @@ mpfr_round_raw_generic(
if (use_inexp)
*inexp = 0;
#if flag == 0
- MPN_COPY_DECR(yp + (nw - xsize), xp, xsize);
+ mpn_copyd (yp + (nw - xsize), xp, xsize);
MPN_ZERO(yp, nw - xsize);
#endif
return 0;
@@ -159,7 +159,7 @@ mpfr_round_raw_generic(
/* ((neg!=0)^(sb!=0)) ? MPFR_EVEN_INEX : -MPFR_EVEN_INEX */
/* since neg = 0 or 1 and sb = 0 */
#if flag == 0
- MPN_COPY_INCR(yp, xp + xsize - nw, nw);
+ mpn_copyi (yp, xp + xsize - nw, nw);
yp[0] &= himask;
#endif
return 0; /* sb != 0 && rnd_mode != MPFR_RNDZ */
@@ -204,7 +204,7 @@ mpfr_round_raw_generic(
/* ((neg != 0) ^ (rnd_mode != MPFR_RNDZ)) ? 1 : -1 */
*inexp = MPFR_UNLIKELY(sb == 0) ? 0 : (2*neg-1);
#if flag == 0
- MPN_COPY_INCR(yp, xp + xsize - nw, nw);
+ mpn_copyi (yp, xp + xsize - nw, nw);
yp[0] &= himask;
#endif
return 0; /* sb != 0 && rnd_mode != MPFR_RNDZ */
@@ -221,7 +221,7 @@ mpfr_round_raw_generic(
/* ((neg != 0) ^ (rnd_mode != MPFR_RNDZ)) ? 1 : -1 */
*inexp = 0;
#if flag == 0
- MPN_COPY_INCR(yp, xp + xsize - nw, nw);
+ mpn_copyi (yp, xp + xsize - nw, nw);
yp[0] &= himask;
#endif
return 0;
@@ -254,7 +254,7 @@ mpfr_round_raw_generic(
}
else
himask = MPFR_LIMB_MAX;
- MPN_COPY_INCR(yp, xp + xsize - nw, nw);
+ mpn_copyi (yp, xp + xsize - nw, nw);
yp[0] &= himask;
#endif
return 0;
diff --git a/src/set_uj.c b/src/set_uj.c
index 50d77114a..fafc101c2 100644
--- a/src/set_uj.c
+++ b/src/set_uj.c
@@ -95,7 +95,7 @@ mpfr_set_uj_2exp (mpfr_t x, uintmax_t j, intmax_t e, mpfr_rnd_t rnd)
if (MPFR_LIKELY (cnt != 0))
mpn_lshift (yp+len, yp, k, cnt); /* Normalize the high limb */
else if (len != 0)
- MPN_COPY_DECR (yp+len, yp, k); /* Must use DECR */
+ mpn_copyd (yp+len, yp, k); /* Must use copyd */
if (len != 0)
{
if (len == 1)
diff --git a/src/sqr.c b/src/sqr.c
index 3cd826567..d5c1570e9 100644
--- a/src/sqr.c
+++ b/src/sqr.c
@@ -119,28 +119,50 @@ mpfr_sqr_1 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p)
/* Special code for GMP_NUMB_BITS < prec(a) < 2*GMP_NUMB_BITS and
GMP_NUMB_BITS < prec(b) <= 2*GMP_NUMB_BITS.
- Note: this function was copied from mpfr_mul_2 in file mul.c, thus any change
- here should be done also in mpfr_mul_2. */
+ Note: this function was copied and optimized from mpfr_mul_2 in file mul.c,
+ thus any change here should be done also in mpfr_mul_2, if applicable. */
static int
mpfr_sqr_2 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p)
{
mp_limb_t h, l, u, v;
mpfr_limb_ptr ap = MPFR_MANT(a);
- mpfr_exp_t ax = MPFR_GET_EXP(b) * 2;
+ mpfr_exp_t ax = 2 * MPFR_GET_EXP(b);
mpfr_prec_t sh = 2 * GMP_NUMB_BITS - p;
mp_limb_t rb, sb, sb2, mask = MPFR_LIMB_MASK(sh);
mp_limb_t *bp = MPFR_MANT(b);
/* we store the 4-limb product in h=ap[1], l=ap[0], sb=ap[-1], sb2=ap[-2] */
umul_ppmm (h, l, bp[1], bp[1]);
- umul_ppmm (sb, sb2, bp[0], bp[0]);
umul_ppmm (u, v, bp[1], bp[0]);
- add_ssaaaa (l, sb, l, sb, u, v);
- /* warning: (l < u) is incorrect to detect a carry out of add_ssaaaa, since we
- might have u = 111...111, a carry coming from sb+v, thus l = u */
- h += (l < u) || (l == u && sb < v);
- add_ssaaaa (l, sb, l, sb, u, v);
- h += (l < u) || (l == u && sb < v);
+ l += u << 1;
+ h += (l < (u << 1)) + (u >> (GMP_NUMB_BITS - 1));
+
+ /* now the full square is {h, l, 2*v + high(b0*b0), low(b0*b0)},
+ where the lower part contributes less than 3 ulps to {h, l} */
+
+ /* If h has its most significant bit set and the low sh-1 bits of l are not
+ 000...000 nor 111...111 nor 111...110, then we can round correctly;
+ if h has zero as most significant bit, we have to shift left h and l,
+ thus if the low sh-2 bits are not 000...000 nor 111...111 nor 111...110,
+ then we can round correctly. To avoid an extra test we consider the latter
+ case (if we can round, we can also round in the former case).
+ For sh <= 3, we have mask <= 7, thus (mask>>2) <= 1, and the approximation
+ cannot be enough. */
+ if (MPFR_LIKELY(((l + 2) & (mask >> 2)) > 2))
+ sb = sb2 = 1; /* result cannot be exact in that case */
+ else
+ {
+ mp_limb_t carry1, carry2;
+
+ umul_ppmm (sb, sb2, bp[0], bp[0]);
+ /* the full product is {h, l, sb + 2*v, sb2} */
+ ADD_LIMB (sb, v, carry1);
+ ADD_LIMB (l, carry1, carry2);
+ h += carry2;
+ ADD_LIMB (sb, v, carry1);
+ ADD_LIMB (l, carry1, carry2);
+ h += carry2;
+ }
if (h < MPFR_LIMB_HIGHBIT)
{
ax --;
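The rewritten accumulation in mpfr_sqr_2 above adds 2*u to the double-limb value h:l as l += u << 1 plus a carry into h composed of the carry-out of that addition and the top bit of u that u << 1 shifts out. The following standalone check compares that identity against a 128-bit reference; unsigned __int128 and the values are assumptions of the sketch (not MPFR code):

#include <stdint.h>
#include <assert.h>

typedef unsigned __int128 u128;   /* assumes a compiler providing __int128 */

/* return (h:l) + 2*u, computed limb by limb as in mpfr_sqr_2 */
static u128
add_twice (uint64_t h, uint64_t l, uint64_t u)
{
  l += u << 1;
  h += (l < (u << 1))      /* carry out of the low-limb addition */
       + (u >> 63);        /* top bit of u, shifted out by u << 1 */
  return ((u128) h << 64) | l;
}

int main (void)
{
  uint64_t h = 3, l = UINT64_C(0xfedcba9876543210);
  uint64_t u = UINT64_C(0xdeadbeefcafebabe);
  u128 ref = (((u128) h << 64) | l) + 2 * (u128) u;
  assert (add_twice (h, l, u) == ref);
  return 0;
}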
diff --git a/src/sqrt.c b/src/sqrt.c
index ee3175fc1..853cb360c 100644
--- a/src/sqrt.c
+++ b/src/sqrt.c
@@ -85,19 +85,33 @@ mpfr_sqrt1_approx (mp_limb_t n)
} \
while (0)
-/* Put in rp[1]*2^64+rp[0] an approximation of sqrt(2^128*n),
+/* Put in rp[1]*2^64+rp[0] an approximation of floor(sqrt(2^128*n)),
with 2^126 <= n := np[1]*2^64 + np[0] < 2^128.
- The error on {rp, 2} is less than 43 ulps (in unknown direction).
+ The error on {rp, 2} is at most 27 ulps, with {rp, 2} <= floor(sqrt(2^128*n)).
*/
static void
mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np)
{
- int i = np[1] >> 56;
+ int i;
mp_limb_t x, r1, r0, h, l, t;
const mp_limb_t *u;
const mp_limb_t magic = 0xda9fbe76c8b43800; /* ceil(0.854*2^64) */
- x = np[1] << 8 | (np[0] >> 56);
+ /* We round x upward to ensure {np,2} >= r1^2 below. Since we also include the upper 8
+ bits of np[0] in x, we have to deal separately with the case where np[1] = 2^64-1 and
+ the upper 8 bits of np[0] are all 1. */
+ l = np[0] + MPFR_LIMB_MASK(56);
+ h = np[1] + (l < np[0]);
+ i = h >> 56;
+ x = (h << 8) | (l >> 56);
+ if (MPFR_UNLIKELY(i == 0))
+ /* this happens when np[1] = 2^64-1, the upper 8 bits of np[0] are all 1, and the lower
+ 56 bits of np[0] are not all zero */
+ {
+ MPFR_ASSERTD(x == MPFR_LIMB_ZERO);
+ goto compute_r1;
+ }
+ MPFR_ASSERTD(64 <= i && i < 256);
u = V[i - 64];
umul_ppmm (h, l, u[8], x);
/* the truncation error on h is at most 1 here */
@@ -116,12 +130,14 @@ mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np)
truncation error on h+l/2^64 is <= 6/2^6 */
sub_ddmmss (h, l, u[0], u[1], h, l);
/* Since the mathematical error is < 0.412e-19*2^64, the total error on
- h + l/2^64 is less than 0.854; magic = ceil(0.854*2^64). */
+ h + l/2^64 is less than 0.854; magic = ceil(0.854*2^64). We subtract it,
+ while keeping x >= 0. */
x = h - (l < magic && h != 0);
- /* now 2^64 + x is an approximation of 2^96/sqrt(np[1]),
- with 2^64 + x < 2^96/sqrt(np[1]) */
-
+ /* now 2^64 + x is an approximation of 2^96/sqrt(np[1]+1),
+ with 2^64 + x <= 2^96/sqrt(np[1]+1) */
+
+ compute_r1:
umul_ppmm (r1, l, np[1], x);
r1 += np[1];
@@ -132,11 +148,13 @@ mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np)
r1 = MPFR_LIMB_HIGHBIT;
umul_ppmm (h, l, r1, r1);
+ MPFR_ASSERTD(h < np[1] || (h == np[1] && l <= np[0]));
sub_ddmmss (h, l, np[1], np[0], h, l);
+ /* divide by 2 */
l = (h << 63) | (l >> 1);
h = h >> 1;
- /* now add (2^64+x) * (h*2^64+l) / 2^64 to r0 */
+ /* now add (2^64+x) * (h*2^64+l) / 2^64 to [r1*2^64, 0] */
umul_ppmm (r0, t, x, l); /* x * l */
r0 += l;
@@ -147,16 +165,10 @@ mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np)
r1 += (r0 < x); /* add x */
}
- if (r1 & MPFR_LIMB_HIGHBIT)
- {
- rp[0] = r0;
- rp[1] = r1;
- }
- else /* overflow occurred in r1 */
- {
- rp[0] = ~MPFR_LIMB_ZERO;
- rp[1] = ~MPFR_LIMB_ZERO;
- }
+ MPFR_ASSERTD(r1 & MPFR_LIMB_HIGHBIT);
+
+ rp[0] = r0;
+ rp[1] = r1;
}
/* Special code for prec(r), prec(u) < GMP_NUMB_BITS. We cannot have
@@ -312,8 +324,10 @@ mpfr_sqrt2 (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
mask = MPFR_LIMB_MASK(sh);
mpfr_sqrt2_approx (rp, np + 2);
- /* the error is less than 43 ulps on rp[0] */
- if (((rp[0] + 42) & (mask >> 1)) > 84)
+ /* the error is at most 27 ulps on rp[0], with {rp, 2} smaller or equal
+ to the exact square root, thus we can round correctly except when the
+ number formed by the last sh-1 bits of rp[0] is 0, -1, -2, ..., -27. */
+ if (((rp[0] + 27) & (mask >> 1)) > 27)
sb = 1;
else
{
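The new test ((rp[0] + 27) & (mask >> 1)) > 27 in mpfr_sqrt2 expresses "the approximation is good enough to round": {rp,2} is at most 27 ulps below the exact square root, so rounding from it is unsafe only when the number formed by the last sh-1 bits of rp[0] is 0, -1, ..., -27 modulo 2^(sh-1). A small standalone illustration with an arbitrary sh and made-up values (not MPFR code):

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  int sh = 11;                                   /* low bits to be rounded off */
  uint64_t mask = (UINT64_C(1) << sh) - 1;       /* like MPFR_LIMB_MASK(sh)    */
  uint64_t ok  = UINT64_C(0x123456789abc0640);   /* low sh-1 bits far from 0   */
  uint64_t bad = UINT64_C(0x123456789abc0400);   /* low sh-1 bits equal to 0   */

  /* 1: a 27-ulp error cannot change the rounding, use the approximation */
  printf ("%d\n", ((ok + 27) & (mask >> 1)) > 27);
  /* 0: the rounding is ambiguous, an exact computation is needed */
  printf ("%d\n", ((bad + 27) & (mask >> 1)) > 27);
  return 0;
}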
diff --git a/src/strtofr.c b/src/strtofr.c
index efd5ef5ff..b1c62cda3 100644
--- a/src/strtofr.c
+++ b/src/strtofr.c
@@ -532,7 +532,7 @@ parsed_string_to_mpfr (mpfr_t x, struct parsed_string *pstr, mpfr_rnd_t rnd)
if (real_ysize != ysize)
{
if (count == 0)
- MPN_COPY_DECR (y + ysize - real_ysize, y, real_ysize);
+ mpn_copyd (y + ysize - real_ysize, y, real_ysize);
MPN_ZERO (y, ysize - real_ysize);
}
/* for each bit shift decrease exponent of y */
diff --git a/src/sub1sp.c b/src/sub1sp.c
index f9eee75dd..b469b25a5 100644
--- a/src/sub1sp.c
+++ b/src/sub1sp.c
@@ -620,6 +620,8 @@ mpfr_sub1sp3 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode,
if (d < GMP_NUMB_BITS)
{
mp_limb_t cy;
+ /* warning: we must have the most significant bit of sb correct
+ since it might become the round bit below */
sb = cp[0] << (GMP_NUMB_BITS - d); /* neglected part of c */
a0 = bp[0] - ((cp[1] << (GMP_NUMB_BITS - d)) | (cp[0] >> d));
a1 = bp[1] - ((cp[2] << (GMP_NUMB_BITS - d)) | (cp[1] >> d))
@@ -681,6 +683,8 @@ mpfr_sub1sp3 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode,
else if (d < 2 * GMP_NUMB_BITS)
{
mp_limb_t c0shifted;
+ /* warning: we must have the most significant bit of sb correct
+ since it might become the round bit below */
sb = (d == GMP_NUMB_BITS) ? cp[0]
: (cp[1] << (2*GMP_NUMB_BITS - d)) | (cp[0] != 0);
c0shifted = (d == GMP_NUMB_BITS) ? cp[1]
@@ -980,8 +984,8 @@ mpfr_sub1sp (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode)
mpn_lshift (ap + len, ap, k, cnt); /* Normalize the High Limb*/
else
{
- /* Must use DECR since src and dest may overlap & dest>=src*/
- MPN_COPY_DECR(ap+len, ap, k);
+ /* Must use copyd since src and dst may overlap & dst>=src */
+ mpn_copyd (ap+len, ap, k);
}
MPN_ZERO(ap, len); /* Zeroing the last limbs */
bx -= cnt + len*GMP_NUMB_BITS; /* Update Expo */
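The warning comments added to mpfr_sub1sp3 above concern the sticky word sb: its most significant bit must be the most significant discarded bit of c, since it may become the round bit, while all lower discarded bits only matter through the "| (cp[0] != 0)" collapse. A standalone sketch of that construction for GMP_NUMB_BITS < d < 2*GMP_NUMB_BITS, using plain uint64_t limbs and made-up values (not MPFR code):

#include <stdint.h>
#include <assert.h>

int main (void)
{
  uint64_t cp1 = UINT64_C(0x0123456789abcdef);   /* high limb of c */
  uint64_t cp0 = UINT64_C(0x0000000000000001);   /* low limb: only low bits set */
  unsigned d = 72;                               /* shift amount, 64 < d < 128 */

  /* The d discarded bits are the low d-64 bits of cp1 followed by all of
     cp0: keep the top of that block in place and fold cp0 into bit 0. */
  uint64_t sb = (cp1 << (128 - d)) | (cp0 != 0);

  assert ((sb >> 63) == ((cp1 >> (d - 65)) & 1));  /* round bit preserved */
  assert ((sb & 1) != 0);                          /* lower bits not lost */
  return 0;
}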
diff --git a/src/sum.c b/src/sum.c
index 6f7268240..9fd9d693d 100644
--- a/src/sum.c
+++ b/src/sum.c
@@ -473,7 +473,7 @@ sum_raw (mp_limb_t *wp, mp_size_t ws, mpfr_prec_t wq, mpfr_ptr *const x,
if (MPFR_LIKELY (shiftc != 0))
mpn_lshift (wp + shifts, wp, ws - shifts, shiftc);
else
- MPN_COPY_DECR (wp + shifts, wp, ws - shifts);
+ mpn_copyd (wp + shifts, wp, ws - shifts);
MPN_ZERO (wp, shifts);
/* Compute minexp = minexp - shiftq safely. */
SAFE_SUB (minexp, minexp, shiftq);