diff options
author | zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> | 2016-01-10 21:39:53 +0000 |
---|---|---|
committer | zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> | 2016-01-10 21:39:53 +0000 |
commit | da2e71e315004d24324ff9ae3cead2388b7eab81 (patch) | |
tree | e93d004f2172f8ad8d70b6c9ac5f08cf72a03346 /src/fma.c | |
parent | 92bf640eff4a209aaeb35cd0ad568b98c91f74a3 (diff) | |
download | mpfr-da2e71e315004d24324ff9ae3cead2388b7eab81.tar.gz |
speedup in mpfr_fma and mpfr_fms
new functions mpfr_fmma and mpfr_fmms
modified mbench/fma to compute b*c+c instead of b*b+c (b*c+d would be better)
git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/trunk@9788 280ebfd0-de03-0410-8827-d642c229c3f4
Diffstat (limited to 'src/fma.c')
-rw-r--r-- | src/fma.c | 36 |
1 file changed, 35 insertions(+), 1 deletion(-)
@@ -93,6 +93,7 @@ mpfr_fma_singular (mpfr_ptr s, mpfr_srcptr x, mpfr_srcptr y, mpfr_srcptr z,
     }
 }
 
+/* s <- x*y + z */
 int
 mpfr_fma (mpfr_ptr s, mpfr_srcptr x, mpfr_srcptr y, mpfr_srcptr z,
           mpfr_rnd_t rnd_mode)
@@ -115,10 +116,43 @@ mpfr_fma (mpfr_ptr s, mpfr_srcptr x, mpfr_srcptr y, mpfr_srcptr z,
                      MPFR_IS_SINGULAR(z) ))
     return mpfr_fma_singular (s, x, y, z, rnd_mode);
 
+  /* First deal with special case where prec(x) = prec(y) and x*y does
+     not overflow nor underflow. Do it only for small sizes since for large
+     sizes x*y is faster using Mulders' algorithm (as a rule of thumb,
+     we assume mpn_mul_n is faster up to 4*MPFR_MUL_THRESHOLD).
+     Since |EXP(x)|, |EXP(y)| < 2^(k-2) on a k-bit computer,
+     |EXP(x)+EXP(y)| < 2^(k-1), thus cannot overflow nor underflow. */
+  mp_size_t n = MPFR_LIMB_SIZE(x);
+  if (n <= 4 * MPFR_MUL_THRESHOLD && MPFR_PREC(x) == MPFR_PREC(y) &&
+      MPFR_EXP(x) + MPFR_EXP(y) <= __gmpfr_emax &&
+      MPFR_EXP(x) + MPFR_EXP(y) > __gmpfr_emin)
+    {
+      mp_size_t un = n + n;
+      mp_ptr up;
+      MPFR_TMP_DECL(marker);
+
+      MPFR_TMP_MARK(marker);
+      MPFR_TMP_INIT (up, u, un * GMP_NUMB_BITS, un);
+      up = MPFR_MANT(u);
+      /* multiply x*y exactly into u */
+      mpn_mul_n (up, MPFR_MANT(x), MPFR_MANT(y), n);
+      if ((up[un - 1] & MPFR_LIMB_HIGHBIT) == 0)
+        {
+          mpn_lshift (up, up, un, 1);
+          MPFR_EXP(u) = MPFR_EXP(x) + MPFR_EXP(y) - 1;
+        }
+      else
+        MPFR_EXP(u) = MPFR_EXP(x) + MPFR_EXP(y);
+      MPFR_SIGN(u) = MPFR_MULT_SIGN( MPFR_SIGN(x) , MPFR_SIGN(y) );
+      inexact = mpfr_add (s, u, z, rnd_mode);
+      MPFR_TMP_FREE(marker);
+      return inexact;
+    }
+
   /* If we take prec(u) >= prec(x) + prec(y), the product u <- x*y is exact,
      except in case of overflow or underflow. */
-  MPFR_SAVE_EXPO_MARK (expo);
   MPFR_GROUP_INIT_1 (group, MPFR_PREC(x) + MPFR_PREC(y), u);
+  MPFR_SAVE_EXPO_MARK (expo);
   if (MPFR_UNLIKELY (mpfr_mul (u, x, y, MPFR_RNDN)))
     {