speedup in mpfr_fma and mpfr_fms

new functions mpfr_fmma and mpfr_fmms modified mbench/fma to compute b*c+c instead of b*b+c (b*c+d would be better) git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/trunk@9788 280ebfd0-de03-0410-8827-d642c229c3f4
author: zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> 2016-01-10 21:39:53 +0000
committer: zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> 2016-01-10 21:39:53 +0000
commit: da2e71e315004d24324ff9ae3cead2388b7eab81 (patch)
tree: e93d004f2172f8ad8d70b6c9ac5f08cf72a03346
parent: 92bf640eff4a209aaeb35cd0ad568b98c91f74a3 (diff)
download: mpfr-da2e71e315004d24324ff9ae3cead2388b7eab81.tar.gz
7 files changed, 88 insertions, 5 deletions
diff --git a/NEWS b/NEWS
index f006866d1..68435d76d 100644
--- a/NEWS
+++ b/NEWS
@@ -40,6 +40,7 @@ Changes from versions 3.1.* to version 3.2.0:
   rounding to nearest-away (as defined in IEEE 754-2008).
 - New functions mpfr_nrandom and mpfr_erandom to generate random numbers
   following normal and exponential distributions respectively.
+- New functions mpfr_fmma and mpfr_fmms to compute a*b+c*d and a*b-c*d.
 - The behavior of the mpfr_set_exp function changed, as it could easily
   yield undefined behavior in some cases (this modifies both the API and
   the ABI).
@@ -62,7 +63,8 @@ Changes from versions 3.1.* to version 3.2.0:
 - New MPFRbench program (see the tools/bench directory).
 - Speedup in the mpfr_const_euler function (contributed by Fredrik Johansson),
   in the computation of Bernoulli numbers (used in mpfr_gamma, mpfr_li2,
-  mpfr_digamma, mpfr_lngamma and mpfr_lgamma), and in mpfr_div.
+  mpfr_digamma, mpfr_lngamma and mpfr_lgamma), in mpfr_div, in mpfr_fma
+  and mpfr_fms.
 - Bug fixes. In particular: a speed improvement when the --enable-assert
   or --enable-assert=full configure option is used with GCC; mpfr_get_str
   now sets the NaN flag on NaN input.
diff --git a/doc/mpfr.texi b/doc/mpfr.texi
index 6f688b1cf..222bf10cb 100644
--- a/doc/mpfr.texi
+++ b/doc/mpfr.texi
@@ -2151,6 +2151,17 @@ separate addition or subtraction.  That is, the fused operation matters only
 for rounding.
 @end deftypefun
 
+@deftypefun int mpfr_fmma (mpfr_t @var{rop}, mpfr_t @var{op1}, mpfr_t @var{op2}, mpfr_t @var{op3}, mpfr_t @var{op4}, mpfr_rnd_t @var{rnd})
+@deftypefunx int mpfr_fmms (mpfr_t @var{rop}, mpfr_t @var{op1}, mpfr_t @var{op2}, mpfr_t @var{op3}, mpfr_t @var{op4}, mpfr_rnd_t @var{rnd})
+Set @var{rop} to @math{(@var{op1} @GMPtimes{} @var{op2}) + (@var{op3} @GMPtimes{} @var{op4})}
+(resp.@: @math{(@var{op1} @GMPtimes{} @var{op2}) - (@var{op3} @GMPtimes{} @var{op4})})
+rounded in the direction @var{rnd}.
+In case the computation of @math{@var{op1} @GMPtimes{} @var{op2}} overflows or
+underflows (or that of @math{@var{op3} @GMPtimes{} @var{op4}}), the result
+@var{rop} is computed as if the two intermediate products were computed with
+rounding toward zero.
+@end deftypefun
+
 @deftypefun int mpfr_agm (mpfr_t @var{rop}, mpfr_t @var{op1}, mpfr_t @var{op2}, mpfr_rnd_t @var{rnd})
 Set @var{rop} to the arithmetic-geometric mean of @var{op1} and @var{op2},
 rounded in the direction @var{rnd}.
diff --git a/src/Makefile.am b/src/Makefile.am
index a1e0b433b..12478fdd7 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -59,7 +59,7 @@ buildopt.c digamma.c bernoulli.c isregular.c set_flt.c get_flt.c	\
 scale2.c set_z_exp.c ai.c gammaonethird.c ieee_floats.h			\
 grandom.c fpif.c set_float128.c get_float128.c rndna.c nrandom.c        \
 random_deviate.h random_deviate.c erandom.c mpfr-mini-gmp.c             \
-mpfr-mini-gmp.h
+mpfr-mini-gmp.h fmma.c
 
 libmpfr_la_LIBADD = @LIBOBJS@
 
diff --git a/src/fma.c b/src/fma.c
index f6c2702b4..e6e8f2ad3 100644
--- a/src/fma.c
+++ b/src/fma.c
@@ -93,6 +93,7 @@ mpfr_fma_singular (mpfr_ptr s, mpfr_srcptr x, mpfr_srcptr y, mpfr_srcptr z,
     }
 }
 
+/* s <- x*y + z */
 int
 mpfr_fma (mpfr_ptr s, mpfr_srcptr x, mpfr_srcptr y, mpfr_srcptr z,
           mpfr_rnd_t rnd_mode)
@@ -115,10 +116,43 @@ mpfr_fma (mpfr_ptr s, mpfr_srcptr x, mpfr_srcptr y, mpfr_srcptr z,
                      MPFR_IS_SINGULAR(z) ))
     return mpfr_fma_singular (s, x, y, z, rnd_mode);
 
+  /* First deal with special case where prec(x) = prec(y) and x*y does
+     not overflow nor underflow. Do it only for small sizes since for large
+     sizes x*y is faster using Mulders' algorithm (as a rule of thumb,
+     we assume mpn_mul_n is faster up to 4*MPFR_MUL_THRESHOLD).
+     Since |EXP(x)|, |EXP(y)| < 2^(k-2) on a k-bit computer,
+     |EXP(x)+EXP(y)| < 2^(k-1), thus cannot overflow nor underflow. */
+  mp_size_t n = MPFR_LIMB_SIZE(x);
+  if (n <= 4 * MPFR_MUL_THRESHOLD && MPFR_PREC(x) == MPFR_PREC(y) &&
+      MPFR_EXP(x) + MPFR_EXP(y) <= __gmpfr_emax &&
+      MPFR_EXP(x) + MPFR_EXP(y) > __gmpfr_emin)
+    {
+      mp_size_t un = n + n;
+      mp_ptr up;
+      MPFR_TMP_DECL(marker);
+
+      MPFR_TMP_MARK(marker);
+      MPFR_TMP_INIT (up, u, un * GMP_NUMB_BITS, un);
+      up = MPFR_MANT(u);
+      /* multiply x*y exactly into u */
+      mpn_mul_n (up, MPFR_MANT(x), MPFR_MANT(y), n);
+      if ((up[un - 1] & MPFR_LIMB_HIGHBIT) == 0)
+        {
+          mpn_lshift (up, up, un, 1);
+          MPFR_EXP(u) = MPFR_EXP(x) + MPFR_EXP(y) - 1;
+        }
+      else
+        MPFR_EXP(u) = MPFR_EXP(x) + MPFR_EXP(y);
+      MPFR_SIGN(u) = MPFR_MULT_SIGN( MPFR_SIGN(x) , MPFR_SIGN(y) );
+      inexact = mpfr_add (s, u, z, rnd_mode);
+      MPFR_TMP_FREE(marker);
+      return inexact;
+    }
+
   /* If we take prec(u) >= prec(x) + prec(y), the product u <- x*y
      is exact, except in case of overflow or underflow. */
-  MPFR_SAVE_EXPO_MARK (expo);
   MPFR_GROUP_INIT_1 (group, MPFR_PREC(x) + MPFR_PREC(y), u);
+  MPFR_SAVE_EXPO_MARK (expo);
 
   if (MPFR_UNLIKELY (mpfr_mul (u, x, y, MPFR_RNDN)))
     {
diff --git a/src/mpfr.h b/src/mpfr.h
index 8059b87ea..7aa29c614 100644
--- a/src/mpfr.h
+++ b/src/mpfr.h
@@ -781,6 +781,12 @@ __MPFR_DECLSPEC int mpfr_fma _MPFR_PROTO ((mpfr_ptr, mpfr_srcptr, mpfr_srcptr,
                                            mpfr_srcptr, mpfr_rnd_t));
 __MPFR_DECLSPEC int mpfr_fms _MPFR_PROTO ((mpfr_ptr, mpfr_srcptr, mpfr_srcptr,
                                            mpfr_srcptr, mpfr_rnd_t));
+__MPFR_DECLSPEC int mpfr_fmma _MPFR_PROTO ((mpfr_ptr, mpfr_srcptr, mpfr_srcptr,
+                                            mpfr_srcptr, mpfr_srcptr,
+                                            mpfr_rnd_t));
+__MPFR_DECLSPEC int mpfr_fmms _MPFR_PROTO ((mpfr_ptr, mpfr_srcptr, mpfr_srcptr,
+                                            mpfr_srcptr, mpfr_srcptr,
+                                            mpfr_rnd_t));
 __MPFR_DECLSPEC int mpfr_sum _MPFR_PROTO ((mpfr_ptr, mpfr_ptr *const,
                                            unsigned long, mpfr_rnd_t));
 
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 39b7580b7..c50659345 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -20,7 +20,7 @@ check_PROGRAMS = tversion tabort_prec_max tassert tabort_defalloc1	\
      tconst_euler tconst_log2 tconst_pi tcopysign tcos tcosh tcot	\
      tcoth tcsc tcsch td_div td_sub tdigamma tdim tdiv tdiv_d tdiv_ui	\
      teint teq terandom terandom_chisq terf texp texp10 texp2 texpm1	\
-     tfactorial tfits tfma tfmod tfms tfpif tfprintf tfrac tfrexp	\
+     tfactorial tfits tfma tfmma tfmod tfms tfpif tfprintf tfrac tfrexp	\
      tgamma tget_flt tget_d tget_d_2exp tget_f tget_ld_2exp		\
      tget_set_d64 tget_sj tget_str tget_z tgmpop tgrandom thyperbolic	\
      thypot tinp_str tj0 tj1 tjn tl2b tlgamma tli2 tlngamma tlog	\
diff --git a/tools/mbench/mfv5-mpfr.cc b/tools/mbench/mfv5-mpfr.cc
index ac90e9a7b..06c62b8b5 100644
--- a/tools/mbench/mfv5-mpfr.cc
+++ b/tools/mbench/mfv5-mpfr.cc
@@ -72,7 +72,34 @@ public:
 class mpfr_fma_test {
 public:
   int func(mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t r) {
-    return mpfr_fma (a,b,b,c,r);
+    /* prefer mpfr_fma (a,b,c,c,r) which computes b*c + c to
+       mpfr_fma (a,b,b,c,r) which computes b*b + c, where b*b
+       might be computed by a square */
+    return mpfr_fma (a,b,c,c,r);
+  }
+};
+
+class mpfr_fms_test {
+public:
+  int func(mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t r) {
+    /* prefer mpfr_fms (a,b,c,c,r) which computes b*c - c to
+       mpfr_fms (a,b,b,c,r) which computes b*b - c, where b*b
+       might be computed by a square */
+    return mpfr_fms (a,b,c,c,r);
+  }
+};
+
+class mpfr_fmma_test {
+public:
+  int func(mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t r) {
+    return mpfr_fmma (a,b,b,c,c,r);
+  }
+};
+
+class mpfr_fmms_test {
+public:
+  int func(mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t r) {
+    return mpfr_fmms (a,b,b,c,c,r);
   }
 };
 
@@ -197,6 +224,9 @@ static mpfr_test<mpfr_add_test> test1 ("mpfr_add");
 static mpfr_test<mpfr_sub_test> test2 ("mpfr_sub");
 static mpfr_test<mpfr_mul_test> test3 ("mpfr_mul");
 static mpfr_test<mpfr_fma_test> test10 ("mpfr_fma");
+static mpfr_test<mpfr_fms_test> test11 ("mpfr_fms");
+static mpfr_test<mpfr_fmma_test> test12 ("mpfr_fmma");
+static mpfr_test<mpfr_fmms_test> test13 ("mpfr_fmms");
 static mpfr_test<mpfr_div_test> test4 ("mpfr_div");
 static mpfr_test<mpfr_set_test> test5 ("mpfr_set");
author	zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4>	2016-01-10 21:39:53 +0000
committer	zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4>	2016-01-10 21:39:53 +0000
commit	da2e71e315004d24324ff9ae3cead2388b7eab81 (patch)
tree	e93d004f2172f8ad8d70b6c9ac5f08cf72a03346
parent	92bf640eff4a209aaeb35cd0ad568b98c91f74a3 (diff)
download	mpfr-da2e71e315004d24324ff9ae3cead2388b7eab81.tar.gz