diff options
author | zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> | 2017-01-09 14:41:14 +0000 |
---|---|---|
committer | zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> | 2017-01-09 14:41:14 +0000 |
commit | 85804140c8189414c428c3a86f767c258a72dff2 (patch) | |
tree | f0c7710f5b3f090416912716e0238ab58474cd7b /src/sqr.c | |
parent | 24da13c98a29ce7e842cd440c6135efe435b4eab (diff) | |
download | mpfr-85804140c8189414c428c3a86f767c258a72dff2.tar.gz |
[src/sqr.c] added special code for 3 limbs
git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/trunk@11169 280ebfd0-de03-0410-8827-d642c229c3f4
Diffstat (limited to 'src/sqr.c')
-rw-r--r-- | src/sqr.c | 188 |
1 file changed, 169 insertions, 19 deletions
@@ -1,4 +1,4 @@ -/* mpfr_sqr -- Floating square +/* mpfr_sqr -- Floating-point square Copyright 2004-2017 Free Software Foundation, Inc. Contributed by the AriC and Caramba projects, INRIA. @@ -58,7 +58,7 @@ mpfr_sqr_1 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) /* rounding */ if (MPFR_UNLIKELY(ax > __gmpfr_emax)) - return mpfr_overflow (a, rnd_mode, MPFR_SIGN(a)); + return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS); /* Warning: underflow should be checked *after* rounding, thus when rounding away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and @@ -76,7 +76,7 @@ mpfr_sqr_1 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) if (rnd_mode == MPFR_RNDN && (ax < __gmpfr_emin - 1 || (ap[0] == MPFR_LIMB_HIGHBIT && (rb | sb) == 0))) rnd_mode = MPFR_RNDZ; - return mpfr_underflow (a, rnd_mode, MPFR_SIGN(a)); + return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS); } rounding: @@ -98,7 +98,7 @@ mpfr_sqr_1 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) { truncate: MPFR_ASSERTD(ax >= __gmpfr_emin); - MPFR_RET(-MPFR_SIGN(a)); + MPFR_RET(-MPFR_SIGN_POS); } else /* round away from zero */ { @@ -108,12 +108,12 @@ mpfr_sqr_1 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) { ap[0] = MPFR_LIMB_HIGHBIT; if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax)) - return mpfr_overflow (a, rnd_mode, MPFR_SIGN(a)); + return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS); MPFR_ASSERTD(ax + 1 <= __gmpfr_emax); MPFR_ASSERTD(ax + 1 >= __gmpfr_emin); MPFR_SET_EXP (a, ax + 1); } - MPFR_RET(MPFR_SIGN(a)); + MPFR_RET(MPFR_SIGN_POS); } } @@ -158,7 +158,7 @@ mpfr_sqr_2 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) /* rounding */ if (MPFR_UNLIKELY(ax > __gmpfr_emax)) - return mpfr_overflow (a, rnd_mode, MPFR_SIGN(a)); + return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS); /* Warning: underflow should be checked *after* rounding, thus when rounding away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and 
@@ -177,7 +177,7 @@ mpfr_sqr_2 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) (ax < __gmpfr_emin - 1 || (ap[1] == MPFR_LIMB_HIGHBIT && ap[0] == 0 && (rb | sb) == 0))) rnd_mode = MPFR_RNDZ; - return mpfr_underflow (a, rnd_mode, MPFR_SIGN(a)); + return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS); } rounding: @@ -199,7 +199,7 @@ mpfr_sqr_2 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) { truncate: MPFR_ASSERTD(ax >= __gmpfr_emin); - MPFR_RET(-MPFR_SIGN(a)); + MPFR_RET(-MPFR_SIGN_POS); } else /* round away from zero */ { @@ -210,15 +210,161 @@ mpfr_sqr_2 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) { ap[1] = MPFR_LIMB_HIGHBIT; if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax)) - return mpfr_overflow (a, rnd_mode, MPFR_SIGN(a)); + return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS); MPFR_ASSERTD(ax + 1 <= __gmpfr_emax); MPFR_ASSERTD(ax + 1 >= __gmpfr_emin); MPFR_SET_EXP (a, ax + 1); } - MPFR_RET(MPFR_SIGN(a)); + MPFR_RET(MPFR_SIGN_POS); } } +/* Special code for 2*GMP_NUMB_BITS < prec(a) < 3*GMP_NUMB_BITS and + 2*GMP_NUMB_BITS < prec(b) <= 3*GMP_NUMB_BITS. 
*/ +static int +mpfr_sqr_3 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p) +{ + mp_limb_t a0, a1, a2, h, l; + mpfr_limb_ptr ap = MPFR_MANT(a); + mpfr_exp_t ax = 2 * MPFR_GET_EXP(b); + mpfr_prec_t sh = 3 * GMP_NUMB_BITS - p; + mp_limb_t rb, sb, sb2, mask = MPFR_LIMB_MASK(sh); + mp_limb_t *bp = MPFR_MANT(b); + + /* we store the upper 3-limb product in a2, a1, a0: + b2^2, 2*b2*b1, 2*b2*b0+b1^2 */ + + /* first compute b2*b1 and b2*b0, which will be shifted by 1 */ + umul_ppmm (a1, a0, bp[2], bp[1]); + umul_ppmm (h, l, bp[2], bp[0]); + a0 += h; + a1 += (a0 < h); + /* now a1, a0 contains b2*b1 + floor(b2*b0/B): there can be no overflow + since b2*b1*B + b2*b0 <= b2*(b1*B+b0) <= b2*(B^2-1) < B^3 */ + + /* multiply a2, a1, a0 by 2 */ + a2 = a1 >> (GMP_NUMB_BITS - 1); + a1 = (a1 << 1) | (a0 >> (GMP_NUMB_BITS - 1)); + a0 = (a0 << 1); + + /* add b2^2 */ + umul_ppmm (h, l, bp[2], bp[2]); + a1 += l; + a2 += h + (a1 < l); + + /* add b1^2 */ + umul_ppmm (h, l, bp[1], bp[1]); + a0 += h; + a1 += (a0 < h); + a2 += (a1 == 0 && a0 < h); + + /* Now the approximate product {a2, a1, a0} has an error of less than + 5 ulps (3 ulps for the ignored low limbs of 2*b2*b0+b1^2, + plus 2 ulps for the ignored 2*b1*b0 (plus b0^2). + Since we might shift by 1 bit, we make sure the low sh-2 bits of a0 + are not 0, -1, -2, -3 or -4. 
*/ + + if (MPFR_LIKELY(((a0 + 4) & (mask >> 2)) > 4)) + sb = sb2 = 1; /* result cannot be exact in that case */ + else + { + mp_limb_t p[6]; + mpn_sqr (p, bp, 3); + a2 = p[5]; + a1 = p[4]; + a0 = p[3]; + sb = p[2]; + sb2 = p[1] | p[0]; + } + if (a2 < MPFR_LIMB_HIGHBIT) + { + ax --; + a2 = (a2 << 1) | (a1 >> (GMP_NUMB_BITS - 1)); + a1 = (a1 << 1) | (a0 >> (GMP_NUMB_BITS - 1)); + a0 = (a0 << 1) | (sb >> (GMP_NUMB_BITS - 1)); + sb = sb << 1; + /* no need to shift sb2: we only need to know if it is zero or not */ + } + ap[2] = a2; + ap[1] = a1; + rb = a0 & (MPFR_LIMB_ONE << (sh - 1)); + sb |= ((a0 & mask) ^ rb) | sb2; + ap[0] = a0 & ~mask; + + MPFR_SIGN(a) = MPFR_SIGN_POS; + + /* rounding */ + if (MPFR_UNLIKELY(ax > __gmpfr_emax)) + return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS); + + /* Warning: underflow should be checked *after* rounding, thus when rounding + away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and + a >= 0.111...111[1]*2^(emin-1), there is no underflow. */ + if (MPFR_UNLIKELY(ax < __gmpfr_emin)) + { + if ((ax == __gmpfr_emin - 1) && + (ap[2] == MPFR_LIMB_MAX) && + (ap[1] == MPFR_LIMB_MAX) && + (ap[0] == ~mask) && + ((rnd_mode == MPFR_RNDN && rb) || + (MPFR_IS_LIKE_RNDA(rnd_mode, MPFR_IS_NEG (a)) && (rb | sb)))) + goto rounding; /* no underflow */ + /* for RNDN, mpfr_underflow always rounds away, thus for |a| <= 2^(emin-2) + we have to change to RNDZ */ + if (rnd_mode == MPFR_RNDN && + (ax < __gmpfr_emin - 1 || + (ap[2] == MPFR_LIMB_HIGHBIT && ap[1] == 0 && ap[0] == 0 + && (rb | sb) == 0))) + rnd_mode = MPFR_RNDZ; + return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS); + } + + rounding: + MPFR_EXP (a) = ax; /* Don't use MPFR_SET_EXP since ax might be < __gmpfr_emin + in the cases "goto rounding" above. 
*/ + if (rb == 0 && sb == 0) + { + MPFR_ASSERTD(ax >= __gmpfr_emin); + return 0; /* idem than MPFR_RET(0) but faster */ + } + else if (rnd_mode == MPFR_RNDN) + { + if (rb == 0 || (sb == 0 && (ap[0] & (MPFR_LIMB_ONE << sh)) == 0)) + goto truncate; + else + goto add_one_ulp; + } + else if (MPFR_IS_LIKE_RNDZ(rnd_mode, MPFR_IS_NEG(a))) + { + truncate: + MPFR_ASSERTD(ax >= __gmpfr_emin); + MPFR_RET(-MPFR_SIGN_POS); + } + else /* round away from zero */ + { + add_one_ulp: + ap[0] += MPFR_LIMB_ONE << sh; + ap[1] += (ap[0] == 0); + ap[2] += (ap[1] == 0) && (ap[0] == 0); + if (ap[2] == 0) + { + ap[2] = MPFR_LIMB_HIGHBIT; + if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax)) + return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS); + MPFR_ASSERTD(ax + 1 <= __gmpfr_emax); + MPFR_ASSERTD(ax + 1 >= __gmpfr_emin); + MPFR_SET_EXP (a, ax + 1); + } + MPFR_RET(MPFR_SIGN_POS); + } +} + +/* Note: mpfr_sqr will call mpfr_mul if bn > MPFR_SQR_THRESHOLD, + in order to use Mulders' mulhigh, which is handled only here + to avoid partial code duplication. There is some overhead due + to the additional tests, but slowdown should not be noticeable + as this code is not executed in very small precisions. 
*/ + int mpfr_sqr (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode) { @@ -226,7 +372,7 @@ mpfr_sqr (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode) mpfr_exp_t ax; mp_limb_t *tmp; mp_limb_t b1; - mpfr_prec_t bq; + mpfr_prec_t aq, bq; mp_size_t bn, tn; MPFR_TMP_DECL(marker); @@ -250,14 +396,19 @@ mpfr_sqr (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode) ( MPFR_ASSERTD(MPFR_IS_ZERO(b)), MPFR_SET_ZERO(a) ); MPFR_RET(0); } - bq = MPFR_PREC(b); + aq = MPFR_GET_PREC(a); + bq = MPFR_GET_PREC(b); - if (MPFR_GET_PREC(a) < GMP_NUMB_BITS && bq <= GMP_NUMB_BITS) - return mpfr_sqr_1 (a, b, rnd_mode, MPFR_GET_PREC(a)); + if (aq < GMP_NUMB_BITS && bq <= GMP_NUMB_BITS) + return mpfr_sqr_1 (a, b, rnd_mode, aq); - if (GMP_NUMB_BITS < MPFR_GET_PREC(a) && MPFR_GET_PREC(a) < 2 * GMP_NUMB_BITS + if (GMP_NUMB_BITS < aq && aq < 2 * GMP_NUMB_BITS && GMP_NUMB_BITS < bq && bq <= 2 * GMP_NUMB_BITS) - return mpfr_sqr_2 (a, b, rnd_mode, MPFR_GET_PREC(a)); + return mpfr_sqr_2 (a, b, rnd_mode, aq); + + if (2 * GMP_NUMB_BITS < aq && aq < 3 * GMP_NUMB_BITS + && 2 * GMP_NUMB_BITS < bq && bq <= 3 * GMP_NUMB_BITS) + return mpfr_sqr_3 (a, b, rnd_mode, aq); ax = 2 * MPFR_GET_EXP (b); MPFR_ASSERTN (2 * (mpfr_uprec_t) bq <= MPFR_PREC_MAX); @@ -287,8 +438,7 @@ mpfr_sqr (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode) if (MPFR_UNLIKELY(b1 == 0)) mpn_lshift (tmp, tmp, tn, 1); /* tn <= k, so no stack corruption */ - cc = mpfr_round_raw (MPFR_MANT (a), tmp, 2 * bq, 0, - MPFR_PREC (a), rnd_mode, &inexact); + cc = mpfr_round_raw (MPFR_MANT (a), tmp, 2 * bq, 0, aq, rnd_mode, &inexact); /* cc = 1 ==> result is a power of two */ if (MPFR_UNLIKELY(cc)) MPFR_MANT(a)[MPFR_LIMB_SIZE(a)-1] = MPFR_LIMB_HIGHBIT; |