author     vlefevre <vlefevre@280ebfd0-de03-0410-8827-d642c229c3f4>  2017-05-04 07:46:57 +0000
committer  vlefevre <vlefevre@280ebfd0-de03-0410-8827-d642c229c3f4>  2017-05-04 07:46:57 +0000
commit     6de18e36a603b2b4c3f065119de5d5fd78367b51 (patch)
tree       333e193c333ea5369d4fcd17e5fbb469dd50c40d /src
parent     c64ffc2f9958b4fa2eab676fee377ef0fbcaddba (diff)
download   mpfr-6de18e36a603b2b4c3f065119de5d5fd78367b51.tar.gz
Merged r11179-11196 from the trunk (no conflicts).
git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/branches/faithful@11453 280ebfd0-de03-0410-8827-d642c229c3f4
Diffstat (limited to 'src')
-rw-r--r--  src/add1sp.c             |  22
-rw-r--r--  src/div.c                | 205
-rw-r--r--  src/frac.c               |   2
-rw-r--r--  src/mpfr-gmp.h           |  13
-rw-r--r--  src/mpfr-impl.h          |   3
-rw-r--r--  src/rndna.c              |  15
-rw-r--r--  src/round_raw_generic.c  |  12
-rw-r--r--  src/set_uj.c             |   2
-rw-r--r--  src/sqr.c                |  42
-rw-r--r--  src/sqrt.c               |  56
-rw-r--r--  src/strtofr.c            |   2
-rw-r--r--  src/sub1sp.c             |   8
-rw-r--r--  src/sum.c                |   2
13 files changed, 291 insertions, 93 deletions
diff --git a/src/add1sp.c b/src/add1sp.c index 03a8ba929..4182cfcf1 100644 --- a/src/add1sp.c +++ b/src/add1sp.c @@ -123,6 +123,8 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, if (bx == cx) { + /* TODO: a0 = (bp[0] >> 1) + (cp[0] >> 1) is probably better + (no long constant to load in a register). */ /* since bp[0], cp[0] >= MPFR_LIMB_HIGHBIT, a carry always occurs */ a0 = MPFR_LIMB_HIGHBIT | ((bp[0] + cp[0]) >> 1); bx ++; @@ -135,14 +137,17 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, BGreater1: d = (mpfr_uexp_t) bx - cx; mask = MPFR_LIMB_MASK(sh); + /* TODO: Should the case d < sh be removed, i.e. seen as a particular + case of d < GMP_NUMB_BITS? This case would do a bit more operations + but a test would be removed, avoiding pipeline stall issues. */ if (d < sh) { /* we can shift c by d bits to the right without losing any bit, moreover we can shift one more if there is an exponent increase */ - rb = bp[0]; - a0 = rb + (cp[0] >> d); - if (a0 < rb) /* carry */ + a0 = bp[0] + (cp[0] >> d); + if (a0 < bp[0]) /* carry */ { + MPFR_ASSERTD ((a0 & 1) == 0); a0 = MPFR_LIMB_HIGHBIT | (a0 >> 1); bx ++; } @@ -152,10 +157,9 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, } else if (d < GMP_NUMB_BITS) /* sh <= d < GMP_NUMB_BITS */ { - rb = bp[0]; sb = cp[0] << (GMP_NUMB_BITS - d); /* bits from cp[-1] after shift */ - a0 = rb + (cp[0] >> d); - if (a0 < rb) + a0 = bp[0] + (cp[0] >> d); + if (a0 < bp[0]) /* carry */ { sb |= a0 & 1; a0 = MPFR_LIMB_HIGHBIT | (a0 >> 1); @@ -181,9 +185,9 @@ mpfr_add1sp1 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, goto BGreater1; } - /* Note: we could keep the output significand in a0 for the rounding, and only - store it in ap[0] at the very end, but this seems slower on average (but better - for the worst case). */ + /* Note: we could keep the output significand in a0 for the rounding, + and only store it in ap[0] at the very end, but this seems slower + on average (but better for the worst case). */ /* now perform rounding */ if (MPFR_UNLIKELY(bx > __gmpfr_emax)) @@ -37,6 +37,7 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., #include "invert_limb.h" +#if 1 /* Given u = u1*B+u0 < d = d1*B+d0 with d normalized (high bit of d1 set), put in v = v1*B+v0 an approximation of floor(u*B^2/d), with: B = 2^GMP_NUMB_BITS and v <= floor(u*B^2/d) <= v + 16. @@ -78,11 +79,87 @@ mpfr_div2_approx (mpfr_limb_ptr v1, mpfr_limb_ptr v0, *v1 += 1 + (*v0 < x); } } +#else +/* the following is some experimental code, not fully proven correct, + and which does not seem to be faster, unless we find a way to speed up + the while loop (if 'while' is replaced by 'if' it becomes faster than + the above code, but is then wrong) */ +static void +mpfr_div2_approx (mpfr_limb_ptr Q1, mpfr_limb_ptr Q0, + mp_limb_t u1, mp_limb_t u0, + mp_limb_t v1, mp_limb_t v0) +{ + mp_limb_t inv, q1, q0, r2, r1, r0, cy; + + /* first compute an approximation of q1 */ + if (MPFR_UNLIKELY(v1 == MPFR_LIMB_MAX)) + inv = MPFR_LIMB_ZERO; + else + __gmpfr_invert_limb_approx (inv, v1 + 1); + /* now inv <= B^2/(v1+1) - B */ + umul_ppmm (q1, q0, u1, inv); + q1 += u1; + + /* we have q1 <= floor(u1*2^GMP_NUMB_BITS/v1) <= q1 + 2 */ + + umul_ppmm (r2, r1, q1, v1); + umul_ppmm (cy, r0, q1, v0); + r1 += cy; + r2 += (r1 < cy); + + /* Now r2:r1:r0 = q1 * v1:v0. While q1*v1 <= u1:0, it might be that + r2:r1 > u1:u0. First subtract r2:r1:r0 from u1:u0:0, with result + in r2:r1:r0. 
*/ + r0 = -r0; + r1 = u0 - r1 - (r0 != 0); + r2 = u1 - r2 - (r1 > u0 || (r1 == u0 && r0 != 0)); + + /* u1:u0:0 - q1 * v1:v0 >= u1*B^2 - q1*(v1*B+B-1) >= -q1*B > -B^2 + thus r2 can be 0 or -1 here. */ + if (r2 & MPFR_LIMB_HIGHBIT) /* correction performed at most two times */ + { + ADD_LIMB(r0, v0, cy); /* r0 += v0; cy = r0 < v0 */ + ADD_LIMB(r1, v1, cy); + r2 += cy; + q1 --; + if (r2 & MPFR_LIMB_HIGHBIT) + { + ADD_LIMB(r0, v0, cy); /* r0 += v0; cy = r0 < v0 */ + ADD_LIMB(r1, v1, cy); + r2 += cy; + q1 --; + } + } + else + { + /* experimentally, the number of loops is <= 5 */ + while (r2 || r1 > v1 || (r1 == v1 && r0 >= v0)) + { + r2 -= (r1 < v1 || (r1 == v1 && r0 < v0)); + r1 -= v1 + (r0 < v0); + r0 -= v0; + q1 ++; + } + } + + /* now u1:u0:0 = q1 * d1:d0 + r1:r0, with 0 <= r1:r0 < d1:d0 */ + MPFR_ASSERTD(r2 == MPFR_LIMB_ZERO); + MPFR_ASSERTD(r1 < v1 || (r1 == v1 && r0 < v0)); + + /* the second quotient limb is approximated by r1*B / d1, + which is itself approximated by r1+(r1*inv/B) */ + umul_ppmm (q0, r0, r1, inv); + + *Q1 = q1; + *Q0 = r1 + q0; + + /* experimentally: q1:q0 <= floor(B^2*u/v) <= q1:q0 + 5 */ +} +#endif #endif /* GMP_NUMB_BITS == 64 */ -/* Special code for p=PREC(q) < GMP_NUMB_BITS, - and PREC(u), PREC(v) <= GMP_NUMB_BITS */ +/* Special code for PREC(q) = PREC(u) = PREC(v) = p < GMP_NUMB_BITS */ static int mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) { @@ -99,31 +176,60 @@ mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) u0 -= v0; #if GMP_NUMB_BITS == 64 /* __gmpfr_invert_limb_approx only exists for 64-bit */ - /* first try with an approximate quotient */ + /* First try with an approximate quotient. + FIXME: for p<=62 we have sh-1<2 and will never be able to round correctly. + Even for p=61 we have sh-1=2 and we can round correctly only when the two + last bist of q0 are 01, which happens with probability 25% only. */ { mp_limb_t inv; __gmpfr_invert_limb_approx (inv, v0); - umul_ppmm (q0, sb, u0, inv); + umul_ppmm (rb, sb, u0, inv); } - q0 = (q0 + u0) >> extra; - /* before the >> extra shift, q0 + u0 does not exceed the true quotient - floor(u'0*2^GMP_NUMB_BITS/v0), with error at most 2, which means the rational - quotient q satisfies q0 + u0 <= q < q0 + u0 + 3. We can round correctly except - when the last sh-1 bits of q0 are 000..000 or 111..111 or 111..110. */ + rb += u0; + q0 = rb >> extra; + /* rb does not exceed the true quotient floor(u0*2^GMP_NUMB_BITS/v0), + with error at most 2, which means the rational quotient q satisfies + rb <= q < rb + 3. We can round correctly except when the last sh-1 bits + of q0 are 000..000 or 111..111 or 111..110. 
*/ if (MPFR_LIKELY(((q0 + 2) & (mask >> 1)) > 2)) { rb = q0 & (MPFR_LIMB_ONE << (sh - 1)); - sb = 1; /* result cannot be exact when we can round with an approximation */ + sb = 1; /* result cannot be exact in this case */ } - else /* compute exact quotient and remainder */ -#endif + else /* the true quotient is rb, rb+1 or rb+2 */ { - udiv_qrnnd (q0, sb, u0, 0, v0); - sb |= q0 & extra; + mp_limb_t h, l; + q0 = rb; + umul_ppmm (h, l, q0, v0); + MPFR_ASSERTD(h < u0 || (h == u0 && l == MPFR_LIMB_ZERO)); + /* subtract {h,l} from {u0,0} */ + sub_ddmmss (h, l, u0, 0, h, l); + /* the remainder {h, l} should be < v0 */ + if (h || l >= v0) + { + q0 ++; + h -= (l < v0); + l -= v0; + } + if (h || l >= v0) + { + q0 ++; + h -= (l < v0); + l -= v0; + } + MPFR_ASSERTD(h == 0 && l < v0); + sb = l | (q0 & extra); q0 >>= extra; rb = q0 & (MPFR_LIMB_ONE << (sh - 1)); sb |= q0 & (mask >> 1); } +#else + udiv_qrnnd (q0, sb, u0, 0, v0); + sb |= q0 & extra; + q0 >>= extra; + rb = q0 & (MPFR_LIMB_ONE << (sh - 1)); + sb |= q0 & (mask >> 1); +#endif qp[0] = (MPFR_LIMB_HIGHBIT | q0) & ~mask; qx += extra; @@ -165,7 +271,12 @@ mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) } else if (rnd_mode == MPFR_RNDN) { - if (rb == 0 || (sb == 0 && (qp[0] & (MPFR_LIMB_ONE << sh)) == 0)) + /* It is not possible to have rb <> 0 and sb = 0 here, since it would + mean a n-bit by n-bit division gives an exact (n+1)-bit number. + And since the case rb = sb = 0 was already dealt with, we cannot + have sb = 0. Thus we cannot be in the middle of two numbers. */ + MPFR_ASSERTD(sb != 0); + if (rb == 0) goto truncate; else goto add_one_ulp; @@ -194,7 +305,7 @@ mpfr_div_1 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) } /* Special code for GMP_NUMB_BITS < PREC(q) < 2*GMP_NUMB_BITS and - GMP_NUMB_BITS < PREC(u), PREC(v) <= 2*GMP_NUMB_BITS */ + PREC(u) = PREC(v) = PREC(q) */ static int mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) { @@ -220,10 +331,42 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) /* we know q1*B+q0 is smaller or equal to the exact quotient, with difference at most 16 */ if (MPFR_LIKELY(((q0 + 16) & (mask >> 1)) > 16)) + sb = 1; /* result is not exact when we can round with an approximation */ + else { - sb = 1; /* result is not exact when we can round with an approximation */ - goto round_div2; + /* we know q1:q0 is a good-enough approximation, use it! */ + mp_limb_t qq[2], uu[4]; + + qq[0] = q0; + qq[1] = q1; + /* FIXME: instead of using mpn_mul_n, we can use 3 umul_ppmm calls, + since we know the difference should at most 16*(v1:v0) after the + subtraction below, thus at most 16*2^128. 
*/ + mpn_mul_n (uu, qq, MPFR_MANT(v), 2); + /* we now should have uu[3]:uu[2] <= r3:r2 */ + MPFR_ASSERTD(uu[3] < r3 || (uu[3] == r3 && uu[2] <= r2)); + /* subtract {uu,4} from r3:r2:0:0, with result in {uu,4} */ + uu[2] = r2 - uu[2]; + uu[3] = r3 - uu[3] - (uu[2] > r2); + /* now negate uu[1]:uu[0] */ + uu[0] = -uu[0]; + uu[1] = -uu[1] - (uu[0] != 0); + /* there is a borrow in uu[2] when uu[0] and uu[1] are not both zero */ + uu[3] -= (uu[1] != 0 || uu[0] != 0) && (uu[2] == 0); + uu[2] -= (uu[1] != 0 || uu[0] != 0); + MPFR_ASSERTD(uu[3] == MPFR_LIMB_ZERO); + while (uu[2] > 0 || (uu[1] > v1) || (uu[1] == v1 && uu[0] >= v0)) + { + /* add 1 to q1:q0 */ + q0 ++; + q1 += (q0 == 0); + /* subtract v1:v0 to u2:u1:u0 */ + uu[2] -= (uu[1] < v1) || (uu[1] == v1 && uu[0] < v0); + sub_ddmmss (uu[1], uu[0], uu[1], uu[0], v1, v0); + } + sb = uu[1] | uu[0]; } + goto round_div2; #endif /* now r3:r2 < v1:v0 */ @@ -334,6 +477,9 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) sb = r1 | r0; + /* here, q1:q0 should be an approximation of the quotient (or the exact + quotient), and sb the sticky bit */ + #if GMP_NUMB_BITS == 64 round_div2: #endif @@ -389,8 +535,9 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) } else if (rnd_mode == MPFR_RNDN) { - if (rb == 0 || (rb && sb == 0 && - (qp[0] & (MPFR_LIMB_ONE << sh)) == 0)) + /* See the comment in mpfr_div_1. */ + MPFR_ASSERTD(sb != 0); + if (rb == 0) goto truncate; else goto add_one_ulp; @@ -729,20 +876,22 @@ mpfr_div (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode) } } - usize = MPFR_LIMB_SIZE(u); - vsize = MPFR_LIMB_SIZE(v); - /* When MPFR_GENERIC_ABI is defined, we don't use special code. */ #if !defined(MPFR_GENERIC_ABI) + if (MPFR_GET_PREC(u) == MPFR_GET_PREC(q) && + MPFR_GET_PREC(v) == MPFR_GET_PREC(q)) + { + if (MPFR_GET_PREC(q) < GMP_NUMB_BITS) + return mpfr_div_1 (q, u, v, rnd_mode); - if (MPFR_GET_PREC(q) < GMP_NUMB_BITS && usize == 1 && vsize == 1) - return mpfr_div_1 (q, u, v, rnd_mode); - - if (GMP_NUMB_BITS < MPFR_GET_PREC(q) && MPFR_GET_PREC(q) < 2 * GMP_NUMB_BITS - && usize == 2 && vsize == 2) + if (GMP_NUMB_BITS < MPFR_GET_PREC(q) && + MPFR_GET_PREC(q) < 2 * GMP_NUMB_BITS) return mpfr_div_2 (q, u, v, rnd_mode); + } #endif /* !defined(MPFR_GENERIC_ABI) */ + usize = MPFR_LIMB_SIZE(u); + vsize = MPFR_LIMB_SIZE(v); q0size = MPFR_LIMB_SIZE(q); /* number of limbs of destination */ q0p = MPFR_MANT(q); up = MPFR_MANT(u); diff --git a/src/frac.c b/src/frac.c index 46d4cddd7..f6bdd1bdd 100644 --- a/src/frac.c +++ b/src/frac.c @@ -108,7 +108,7 @@ mpfr_frac (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode) t0 = tn - un; tp = MPFR_MANT(t); if (sh == 0) - MPN_COPY_DECR(tp + t0, up, un + 1); + mpn_copyd (tp + t0, up, un + 1); else /* warning: un may be 0 here */ tp[tn] = k | ((un) ? mpn_lshift (tp + t0, up, un, sh) : MPFR_LIMB_ZERO); if (t0 > 0) diff --git a/src/mpfr-gmp.h b/src/mpfr-gmp.h index af30f5f7c..cad80346c 100644 --- a/src/mpfr-gmp.h +++ b/src/mpfr-gmp.h @@ -126,10 +126,6 @@ void *alloca (size_t); /* MP_LIMB macros */ #define MPN_ZERO(dst, n) memset((dst), 0, (n)*MPFR_BYTES_PER_MP_LIMB) -/* TODO: add MPFR_ASSERTD assertions for MPN_COPY_DECR and MPN_COPY_INCR, - even though memmove works with any overlapping. Useful to detect bugs! 
*/ -#define MPN_COPY_DECR(dst,src,n) memmove((dst),(src),(n)*MPFR_BYTES_PER_MP_LIMB) -#define MPN_COPY_INCR(dst,src,n) memmove((dst),(src),(n)*MPFR_BYTES_PER_MP_LIMB) #define MPN_COPY(dst,src,n) \ do \ { \ @@ -454,12 +450,13 @@ typedef struct {mp_limb_t inv32;} mpfr_pi1_t; } while (0) #endif -/* mpn_copyd is a new exported function in GMP 5. - It existed in GMP 4 in the internal header, but still may not be - defined if HAVE_NATIVE_mpn_copyd is not defined */ +/* mpn_copyi and mpn_copyd are new exported functions in GMP 5. + Defining them to memmove works in overlap cases. */ #if !__MPFR_GMP(5,0,0) +# undef mpn_copyi +# define mpn_copyi memmove # undef mpn_copyd -# define mpn_copyd MPN_COPY +# define mpn_copyd memmove #endif /* The following macro is copied from GMP-6.1.1, file gmp-impl.h, diff --git a/src/mpfr-impl.h b/src/mpfr-impl.h index 5eef0c61c..88765d8a5 100644 --- a/src/mpfr-impl.h +++ b/src/mpfr-impl.h @@ -1443,6 +1443,9 @@ do { \ /* Size of an array, as a constant expression. */ #define numberof_const(x) (sizeof (x) / sizeof ((x)[0])) +/* Addition with carry (detected by GCC and other good compilers). */ +#define ADD_LIMB(u,v,c) ((u) += (v), (c) = (u) < (v)) + /****************************************************** ************ Save exponent/flags macros ************ diff --git a/src/rndna.c b/src/rndna.c index 839fa138b..11866eb3c 100644 --- a/src/rndna.c +++ b/src/rndna.c @@ -87,7 +87,7 @@ mpfr_round_nearest_away_begin (mpfr_t rop) ext = (mpfr_size_limb_extended_t *) (*__gmp_allocate_func) (MPFR_MALLOC_EXTENDED_SIZE(xsize)); - /* Save the context first */ + /* Save the context first. */ ext[ALLOC_SIZE].si = xsize; ext[OLD_MANTISSA].pi = MPFR_MANT(rop); ext[OLD_EXPONENT].ex = MPFR_EXP(rop); @@ -103,15 +103,20 @@ mpfr_round_nearest_away_begin (mpfr_t rop) MPFR_MANT(tmp) = (mp_limb_t*)(ext+MANTISSA); /* Set Mantissa ptr */ MPFR_SET_NAN(tmp); /* initializes to NaN */ - /* Copy rop into tmp now (rop may be used as input latter). */ + /* Note: This full initialization to NaN may be unnecessary because of + the mpfr_set just below, but this is cleaner in case internals would + change in the future (for instance, some fields of tmp could be read + before being set, and reading an uninitialized value is UB here). */ + + /* Copy rop into tmp now (rop may be used as input later). */ MPFR_DBGRES (inexact = mpfr_set(tmp, rop, MPFR_RNDN)); MPFR_ASSERTD(inexact == 0); /* Shall be exact since prec(tmp) > prec(rop) */ - /* Overwrite rop with tmp */ + /* Overwrite rop with tmp. */ rop[0] = tmp[0]; - /* rop now contains a copy of rop, with one extra bit of precision - while keeping the old rop "hidden" within its mantissa. */ + /* The new rop now contains a copy of the old rop, with one extra bit of + precision while keeping the old rop "hidden" within its mantissa. */ return; } diff --git a/src/round_raw_generic.c b/src/round_raw_generic.c index 8501b794a..66dc9ed4f 100644 --- a/src/round_raw_generic.c +++ b/src/round_raw_generic.c @@ -103,7 +103,7 @@ mpfr_round_raw_generic( if (MPFR_UNLIKELY(xprec <= yprec)) { /* No rounding is necessary. 
*/ - /* if yp=xp, maybe an overlap: MPN_COPY_DECR is OK when src <= dst */ + /* if yp=xp, maybe an overlap: mpn_copyd is OK when src <= dst */ if (MPFR_LIKELY(rw)) nw++; MPFR_ASSERTD(nw >= 1); @@ -111,7 +111,7 @@ mpfr_round_raw_generic( if (use_inexp) *inexp = 0; #if flag == 0 - MPN_COPY_DECR(yp + (nw - xsize), xp, xsize); + mpn_copyd (yp + (nw - xsize), xp, xsize); MPN_ZERO(yp, nw - xsize); #endif return 0; @@ -159,7 +159,7 @@ mpfr_round_raw_generic( /* ((neg!=0)^(sb!=0)) ? MPFR_EVEN_INEX : -MPFR_EVEN_INEX */ /* since neg = 0 or 1 and sb = 0 */ #if flag == 0 - MPN_COPY_INCR(yp, xp + xsize - nw, nw); + mpn_copyi (yp, xp + xsize - nw, nw); yp[0] &= himask; #endif return 0; /* sb != 0 && rnd_mode != MPFR_RNDZ */ @@ -204,7 +204,7 @@ mpfr_round_raw_generic( /* ((neg != 0) ^ (rnd_mode != MPFR_RNDZ)) ? 1 : -1 */ *inexp = MPFR_UNLIKELY(sb == 0) ? 0 : (2*neg-1); #if flag == 0 - MPN_COPY_INCR(yp, xp + xsize - nw, nw); + mpn_copyi (yp, xp + xsize - nw, nw); yp[0] &= himask; #endif return 0; /* sb != 0 && rnd_mode != MPFR_RNDZ */ @@ -221,7 +221,7 @@ mpfr_round_raw_generic( /* ((neg != 0) ^ (rnd_mode != MPFR_RNDZ)) ? 1 : -1 */ *inexp = 0; #if flag == 0 - MPN_COPY_INCR(yp, xp + xsize - nw, nw); + mpn_copyi (yp, xp + xsize - nw, nw); yp[0] &= himask; #endif return 0; @@ -254,7 +254,7 @@ mpfr_round_raw_generic( } else himask = MPFR_LIMB_MAX; - MPN_COPY_INCR(yp, xp + xsize - nw, nw); + mpn_copyi (yp, xp + xsize - nw, nw); yp[0] &= himask; #endif return 0; diff --git a/src/set_uj.c b/src/set_uj.c index 50d77114a..fafc101c2 100644 --- a/src/set_uj.c +++ b/src/set_uj.c @@ -95,7 +95,7 @@ mpfr_set_uj_2exp (mpfr_t x, uintmax_t j, intmax_t e, mpfr_rnd_t rnd) if (MPFR_LIKELY (cnt != 0)) mpn_lshift (yp+len, yp, k, cnt); /* Normalize the high limb */ else if (len != 0) - MPN_COPY_DECR (yp+len, yp, k); /* Must use DECR */ + mpn_copyd (yp+len, yp, k); /* Must use copyd */ if (len != 0) { if (len == 1) @@ -119,28 +119,50 @@ mpfr_sqr_1 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) /* Special code for GMP_NUMB_BITS < prec(a) < 2*GMP_NUMB_BITS and GMP_NUMB_BITS < prec(b) <= 2*GMP_NUMB_BITS. - Note: this function was copied from mpfr_mul_2 in file mul.c, thus any change - here should be done also in mpfr_mul_2. */ + Note: this function was copied and optimized from mpfr_mul_2 in file mul.c, + thus any change here should be done also in mpfr_mul_2, if applicable. 
*/ static int mpfr_sqr_2 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) { mp_limb_t h, l, u, v; mpfr_limb_ptr ap = MPFR_MANT(a); - mpfr_exp_t ax = MPFR_GET_EXP(b) * 2; + mpfr_exp_t ax = 2 * MPFR_GET_EXP(b); mpfr_prec_t sh = 2 * GMP_NUMB_BITS - p; mp_limb_t rb, sb, sb2, mask = MPFR_LIMB_MASK(sh); mp_limb_t *bp = MPFR_MANT(b); /* we store the 4-limb product in h=ap[1], l=ap[0], sb=ap[-1], sb2=ap[-2] */ umul_ppmm (h, l, bp[1], bp[1]); - umul_ppmm (sb, sb2, bp[0], bp[0]); umul_ppmm (u, v, bp[1], bp[0]); - add_ssaaaa (l, sb, l, sb, u, v); - /* warning: (l < u) is incorrect to detect a carry out of add_ssaaaa, since we - might have u = 111...111, a carry coming from sb+v, thus l = u */ - h += (l < u) || (l == u && sb < v); - add_ssaaaa (l, sb, l, sb, u, v); - h += (l < u) || (l == u && sb < v); + l += u << 1; + h += (l < (u << 1)) + (u >> (GMP_NUMB_BITS - 1)); + + /* now the full square is {h, l, 2*v + high(b0*c0), low(b0*c0)}, + where the lower part contributes to less than 3 ulps to {h, l} */ + + /* If h has its most significant bit set and the low sh-1 bits of l are not + 000...000 nor 111...111 nor 111...110, then we can round correctly; + if h has zero as most significant bit, we have to shift left h and l, + thus if the low sh-2 bits are not 000...000 nor 111...111 nor 111...110, + then we can round correctly. To avoid an extra test we consider the latter + case (if we can round, we can also round in the former case). + For sh <= 3, we have mask <= 7, thus (mask>>2) <= 1, and the approximation + cannot be enough. */ + if (MPFR_LIKELY(((l + 2) & (mask >> 2)) > 2)) + sb = sb2 = 1; /* result cannot be exact in that case */ + else + { + mp_limb_t carry1, carry2; + + umul_ppmm (sb, sb2, bp[0], bp[0]); + /* the full product is {h, l, sb + v + w, sb2} */ + ADD_LIMB (sb, v, carry1); + ADD_LIMB (l, carry1, carry2); + h += carry2; + ADD_LIMB (sb, v, carry1); + ADD_LIMB (l, carry1, carry2); + h += carry2; + } if (h < MPFR_LIMB_HIGHBIT) { ax --; diff --git a/src/sqrt.c b/src/sqrt.c index ee3175fc1..853cb360c 100644 --- a/src/sqrt.c +++ b/src/sqrt.c @@ -85,19 +85,33 @@ mpfr_sqrt1_approx (mp_limb_t n) } \ while (0) -/* Put in rp[1]*2^64+rp[0] an approximation of sqrt(2^128*n), +/* Put in rp[1]*2^64+rp[0] an approximation of floor(sqrt(2^128*n)), with 2^126 <= n := np[1]*2^64 + np[0] < 2^128. - The error on {rp, 2} is less than 43 ulps (in unknown direction). + The error on {rp, 2} is at most 27 ulps, with {rp, 2} <= floor(sqrt(2^128*n)). */ static void mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np) { - int i = np[1] >> 56; + int i; mp_limb_t x, r1, r0, h, l, t; const mp_limb_t *u; const mp_limb_t magic = 0xda9fbe76c8b43800; /* ceil(0.854*2^64) */ - x = np[1] << 8 | (np[0] >> 56); + /* We round x upward to ensure {np,2} >= r1^2 below. Since we consider also the upper 8 + bits of np[0] into x, we have to deal separately with the case np[1] = 2^64-1 and the + upper 8 bits of np[0] are all 1. 
*/ + l = np[0] + MPFR_LIMB_MASK(56); + h = np[1] + (l < np[0]); + i = h >> 56; + x = (h << 8) | (l >> 56); + if (MPFR_UNLIKELY(i == 0)) + /* this happens when np[1] = 2^64-1, the upper 8 bits of np[0] are all 1, and the lower + 56 bits of np[0] are not all zero */ + { + MPFR_ASSERTD(x == MPFR_LIMB_ZERO); + goto compute_r1; + } + MPFR_ASSERTD(64 <= i && i < 256); u = V[i - 64]; umul_ppmm (h, l, u[8], x); /* the truncation error on h is at most 1 here */ @@ -116,12 +130,14 @@ mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np) truncation error on h+l/2^64 is <= 6/2^6 */ sub_ddmmss (h, l, u[0], u[1], h, l); /* Since the mathematical error is < 0.412e-19*2^64, the total error on - h + l/2^64 is less than 0.854; magic = ceil(0.854*2^64). */ + h + l/2^64 is less than 0.854; magic = ceil(0.854*2^64). We subtract it, + while keeping x >= 0. */ x = h - (l < magic && h != 0); - /* now 2^64 + x is an approximation of 2^96/sqrt(np[1]), - with 2^64 + x < 2^96/sqrt(np[1]) */ - + /* now 2^64 + x is an approximation of 2^96/sqrt(np[1]+1), + with 2^64 + x <= 2^96/sqrt(np[1]+1) */ + + compute_r1: umul_ppmm (r1, l, np[1], x); r1 += np[1]; @@ -132,11 +148,13 @@ mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np) r1 = MPFR_LIMB_HIGHBIT; umul_ppmm (h, l, r1, r1); + MPFR_ASSERTD(h < np[1] || (h == np[1] && l <= np[0])); sub_ddmmss (h, l, np[1], np[0], h, l); + /* divide by 2 */ l = (h << 63) | (l >> 1); h = h >> 1; - /* now add (2^64+x) * (h*2^64+l) / 2^64 to r0 */ + /* now add (2^64+x) * (h*2^64+l) / 2^64 to [r1*2^64, 0] */ umul_ppmm (r0, t, x, l); /* x * l */ r0 += l; @@ -147,16 +165,10 @@ mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np) r1 += (r0 < x); /* add x */ } - if (r1 & MPFR_LIMB_HIGHBIT) - { - rp[0] = r0; - rp[1] = r1; - } - else /* overflow occurred in r1 */ - { - rp[0] = ~MPFR_LIMB_ZERO; - rp[1] = ~MPFR_LIMB_ZERO; - } + MPFR_ASSERTD(r1 & MPFR_LIMB_HIGHBIT); + + rp[0] = r0; + rp[1] = r1; } /* Special code for prec(r), prec(u) < GMP_NUMB_BITS. We cannot have @@ -312,8 +324,10 @@ mpfr_sqrt2 (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode) mask = MPFR_LIMB_MASK(sh); mpfr_sqrt2_approx (rp, np + 2); - /* the error is less than 43 ulps on rp[0] */ - if (((rp[0] + 42) & (mask >> 1)) > 84) + /* the error is at most 27 ulps on rp[0], with {rp, 2} smaller or equal + to the exact square root, thus we can round correctly except when the + number formed by the last sh-1 bits of rp[0] is 0, -1, -2, ..., -27. 
*/ + if (((rp[0] + 27) & (mask >> 1)) > 27) sb = 1; else { diff --git a/src/strtofr.c b/src/strtofr.c index efd5ef5ff..b1c62cda3 100644 --- a/src/strtofr.c +++ b/src/strtofr.c @@ -532,7 +532,7 @@ parsed_string_to_mpfr (mpfr_t x, struct parsed_string *pstr, mpfr_rnd_t rnd) if (real_ysize != ysize) { if (count == 0) - MPN_COPY_DECR (y + ysize - real_ysize, y, real_ysize); + mpn_copyd (y + ysize - real_ysize, y, real_ysize); MPN_ZERO (y, ysize - real_ysize); } /* for each bit shift decrease exponent of y */ diff --git a/src/sub1sp.c b/src/sub1sp.c index f9eee75dd..b469b25a5 100644 --- a/src/sub1sp.c +++ b/src/sub1sp.c @@ -620,6 +620,8 @@ mpfr_sub1sp3 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, if (d < GMP_NUMB_BITS) { mp_limb_t cy; + /* warning: we must have the most significant bit of sb correct + since it might become the round bit below */ sb = cp[0] << (GMP_NUMB_BITS - d); /* neglected part of c */ a0 = bp[0] - ((cp[1] << (GMP_NUMB_BITS - d)) | (cp[0] >> d)); a1 = bp[1] - ((cp[2] << (GMP_NUMB_BITS - d)) | (cp[1] >> d)) @@ -681,6 +683,8 @@ mpfr_sub1sp3 (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode, else if (d < 2 * GMP_NUMB_BITS) { mp_limb_t c0shifted; + /* warning: we must have the most significant bit of sb correct + since it might become the round bit below */ sb = (d == GMP_NUMB_BITS) ? cp[0] : (cp[1] << (2*GMP_NUMB_BITS - d)) | (cp[0] != 0); c0shifted = (d == GMP_NUMB_BITS) ? cp[1] @@ -980,8 +984,8 @@ mpfr_sub1sp (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mpfr_rnd_t rnd_mode) mpn_lshift (ap + len, ap, k, cnt); /* Normalize the High Limb*/ else { - /* Must use DECR since src and dest may overlap & dest>=src*/ - MPN_COPY_DECR(ap+len, ap, k); + /* Must use copyd since src and dst may overlap & dst>=src */ + mpn_copyd (ap+len, ap, k); } MPN_ZERO(ap, len); /* Zeroing the last limbs */ bx -= cnt + len*GMP_NUMB_BITS; /* Update Expo */ @@ -473,7 +473,7 @@ sum_raw (mp_limb_t *wp, mp_size_t ws, mpfr_prec_t wq, mpfr_ptr *const x, if (MPFR_LIKELY (shiftc != 0)) mpn_lshift (wp + shifts, wp, ws - shifts, shiftc); else - MPN_COPY_DECR (wp + shifts, wp, ws - shifts); + mpn_copyd (wp + shifts, wp, ws - shifts); MPN_ZERO (wp, shifts); /* Compute minexp = minexp - shiftq safely. */ SAFE_SUB (minexp, minexp, shiftq); |
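
The patch adds an ADD_LIMB macro to mpfr-impl.h ("Addition with carry") and uses it in the new div.c and sqr.c code to propagate carries with plain C that good compilers turn into add-with-carry instructions. The short program below is only an illustrative sketch, not part of the patch: it chains the same two-statement idiom to add two 2-limb numbers and checks the result against a 128-bit reference. The limb_t typedef, the test values and the __int128 check are assumptions made for this demo.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t limb_t;

/* Same idea as the ADD_LIMB macro from the patch: u += v, and c receives
   the carry out (1 if the addition wrapped around, 0 otherwise). */
#define ADD_LIMB(u, v, c) ((u) += (v), (c) = (u) < (v))

int main (void)
{
  /* a = a1*2^64 + a0 and b = b1*2^64 + b0 */
  limb_t a0 = 0xfffffffffffffff0u, a1 = 1;
  limb_t b0 = 0x20u,               b1 = 2;
  limb_t s0, s1, c0, c1, c2;

  /* low limb: a0 + b0, carry out in c0 */
  s0 = a0;
  ADD_LIMB (s0, b0, c0);

  /* high limb: a1 + b1, then add the carry with a second ADD_LIMB */
  s1 = a1;
  ADD_LIMB (s1, b1, c1);
  ADD_LIMB (s1, c0, c2);

  /* reference check using the (non-portable) __int128 extension */
  unsigned __int128 ref = (((unsigned __int128) a1 << 64) | a0)
                        + (((unsigned __int128) b1 << 64) | b0);
  assert (s0 == (limb_t) ref && s1 == (limb_t) (ref >> 64));
  assert (c1 == 0 && c2 == 0); /* no overflow out of 128 bits here */

  printf ("sum = 0x%016" PRIx64 "%016" PRIx64 "\n", s1, s0);
  return 0;
}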
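
Both mpfr_div_1 and mpfr_div_2 now assert sb != 0 in the MPFR_RNDN branch, on the grounds stated in the new comment: rb != 0 with sb = 0 would mean that a p-bit by p-bit division yields an exact (p+1)-bit quotient, so the exact quotient can never fall halfway between two representable numbers. A brute-force confirmation of this property at a toy precision is sketched below; the precision P and the scaling used to expose the round bit and sticky bit are choices made for this illustration, not MPFR code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy precision; in the patch the special code handles precisions below
   GMP_NUMB_BITS, but the no-halfway-case property is generic. */
#define P 8

int main (void)
{
  uint64_t lo = UINT64_C(1) << (P - 1), hi = UINT64_C(1) << P;

  for (uint64_t u = lo; u < hi; u++)
    for (uint64_t v = lo; v < hi; v++)
      {
        /* Scale u so that the truncated quotient q has exactly P+1 bits:
           its low bit is the round bit, the remainder feeds the sticky bit. */
        int shift = (u >= v) ? P : P + 1;
        uint64_t num = u << shift;
        uint64_t q = num / v, r = num % v;
        assert (q >= hi && q < 2 * hi); /* q has exactly P+1 bits */
        /* rb = 1 with sb = 0 would be a halfway case: it never happens. */
        assert (!((q & 1) == 1 && r == 0));
      }

  printf ("no halfway case found for p = %d\n", P);
  return 0;
}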