3 files changed, 179 insertions, 21 deletions
diff --git a/algorithms.tex b/algorithms.tex
index 86a2cda44..11c98de76 100644
--- a/algorithms.tex
+++ b/algorithms.tex
@@ -541,6 +541,28 @@ where $b[i]$ and $c[i]$ is meant as $0$ for negative $i$,
 and $c[i]$ is meant as $0$ for $i \ge {\tt cn}$
 (${\tt cancel_b} \ge 0$, but ${\tt cancel_c}$ may be negative).
 
+\subsection{The {\tt mpfr\_mul} function}
+
+{\tt mpfr\_mul} uses two algorithms: if the precision of the operands
+is small enough, a plain multiplication using {\tt mpn\_mul} is used
+(there is no error, except in the final rounding);
+otherwise it uses {\tt mpfr\_mulhigh\_n}. 
+
+In this case, it trunks the two operands to $m$ limbs: 
+$1/2 \leq b < 1$ and $1/2 \leq c < 1$, $b = bh+bl$ and $c = ch+c$
+($B=2^{32} or 2^{64}$).
+The error comes from:
+\begin {itemize}
+\item Truncation: $ \leq bl.ch + bh.cl + bl.cl \leq bl + cl \leq 2 B^{-m}$
+\item Mulder: Assuming $error(Mulder(n)) \leq error(mulhigh_basecase(n))$
+  $error(mulhigh(n) \leq (n-1) (B-1)^2 B^{-n-2} + ... + 1 (B-1)^2 B^{-2 n}$
+  $ = \sum_{i=1}^{n-1}{(n-i) (B-1)^2 B^{-n-1-i}}$
+  $ = (B-1)^2 B^{-n-1} \sum_{i=1}^{n-1}{B^{-i}}$
+  $ = (b-1)^2 B^{-n-1} \frac{B^{1-n}-n+n B-B}{(1-B)^2}$
+  $ \leq n B^{-n}$
+\end {itemize}
+Total error: $\leq (m+2) B^{-m}$
+
 \subsection{The {\tt mpfr\_div} function}
 
 The goals of the code of the {\tt mpfr\_div} function include the fact
diff --git a/mul.c b/mul.c
index 7cadc6a93..bd98947a6 100644
--- a/mul.c
+++ b/mul.c
@@ -20,6 +20,7 @@ along with the MPFR Library; see the file COPYING.LIB.  If not, write to
 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 MA 02111-1307, USA. */
 
+#define MPFR_NEED_LONGLONG_H
 #include "mpfr-impl.h"
 
 int
@@ -113,6 +114,7 @@ mpfr_mul (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t rnd_mode)
   tmp = (mp_limb_t *) TMP_ALLOC ((size_t) k * BYTES_PER_MP_LIMB);
 
   /* multiplies two mantissa in temporary allocated space */
+#if 0
   b1 = MPFR_LIKELY (bn >= cn)
     ? mpn_mul (tmp, MPFR_MANT (b), bn, MPFR_MANT (c), cn)
     : mpn_mul (tmp, MPFR_MANT (c), cn, MPFR_MANT (b), bn);
@@ -128,10 +130,71 @@ mpfr_mul (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t rnd_mode)
   if (MPFR_UNLIKELY (b1 == 0))
     mpn_lshift (tmp, tmp, tn, 1); /* tn <= k, so no stack corruption */
 
+#else
+
+  if (MPFR_UNLIKELY (bn < cn)) 
+    {
+      mpfr_srcptr tmp = b;
+      mp_size_t tn  = bn;
+      b = c;
+      bn = cn;
+      c = tmp;
+      cn = tn;
+    }
+  if (MPFR_UNLIKELY (bn > MPFR_MUL_THRESHOLD))
+    {
+      mp_size_t n;
+      mp_prec_t p;
+
+      /* Compute estimated precision of mulhigh */
+      n = MPFR_LIMB_SIZE (a) + 1;
+      n = MIN (n, cn);
+      MPFR_ASSERTD (n >= 1 && 2*n <= k && n <= cn && n <= bn);
+      p = n*BITS_PER_MP_LIMB - MPFR_INT_CEIL_LOG2 (n + (n < cn) + (n < bn) );
+      if (MPFR_UNLIKELY (MPFR_PREC (a) > p - 4))
+	/* MulHigh can't produce a roundable result. */
+	goto full_multiply;
+      /* Compute an approximation of the product of b and c */
+      mpfr_mulhigh_n (tmp+k-2*n, MPFR_MANT (b) + bn - n, 
+		      MPFR_MANT (c) + cn - n, n);
+
+      /* now tmp[0]..tmp[k-1] contains the product of both mantissa,
+	 with tmp[k-1]>=2^(BITS_PER_MP_LIMB-2) */
+      b1 = tmp[k-1] >> (BITS_PER_MP_LIMB - 1); /* msb from the product */
+      
+      /* if the mantissas of b and c are uniformly distributed in ]1/2, 1],
+	 then their product is in ]1/4, 1/2] with probability 2*ln(2)-1 ~ 0.386
+	 and in [1/2, 1] with probability 2-2*ln(2) ~ 0.614 */
+      tmp += k - tn;
+      if (MPFR_UNLIKELY (b1 == 0))
+	mpn_lshift (tmp, tmp, tn, 1);
+
+      if (MPFR_UNLIKELY (!mpfr_can_round_raw (tmp, tn, sign, p + b1 - 1,
+	    GMP_RNDN, GMP_RNDZ, MPFR_PREC(a)+(rnd_mode==GMP_RNDN))))
+	goto full_multiply;
+      }
+  else
+    {
+    full_multiply:
+      b1 = mpn_mul (tmp, MPFR_MANT (b), bn, MPFR_MANT (c), cn);      
+
+      /* now tmp[0]..tmp[k-1] contains the product of both mantissa,
+	 with tmp[k-1]>=2^(BITS_PER_MP_LIMB-2) */
+      b1 >>= BITS_PER_MP_LIMB - 1; /* msb from the product */
+      
+      /* if the mantissas of b and c are uniformly distributed in ]1/2, 1],
+	 then their product is in ]1/4, 1/2] with probability 2*ln(2)-1 ~ 0.386
+	 and in [1/2, 1] with probability 2-2*ln(2) ~ 0.614 */
+      tmp += k - tn;
+      if (MPFR_UNLIKELY (b1 == 0))
+	mpn_lshift (tmp, tmp, tn, 1); /* tn <= k, so no stack corruption */
+    }
+#endif
+
   ax2 = ax + (mp_exp_t) (b1 - 1);
   MPFR_RNDRAW (inexact, a, tmp, bq+cq, rnd_mode, sign, ax2++);
   TMP_FREE (marker);
-  MPFR_EXP  (a) = ax2; /* Can't use MPFR_SET_EXP: Exponent may be out of range */
+  MPFR_EXP  (a) = ax2; /* Can't use MPFR_SET_EXP: Expo may be out of range */
   MPFR_SET_SIGN (a, sign);
   if (MPFR_UNLIKELY (ax2 > __gmpfr_emax))
     return mpfr_overflow (a, rnd_mode, sign);
diff --git a/tuneup.c b/tuneup.c
index 776934081..4b701a1be 100644
--- a/tuneup.c
+++ b/tuneup.c
@@ -30,9 +30,6 @@ MA 02111-1307, USA. */
 #define _PROTO __GMP_PROTO
 #include "speed.h"
 
-#define THRESHOLD_WINDOW 16
-#define THRESHOLD_FINAL_WINDOW 128
-
 int verbose;
 
 /* s->size: precision of both input and output
@@ -73,6 +70,43 @@ int verbose;
   return t;                                          \
 } while (0)
 
+#define SPEED_MPFR_OP(mean_fun) do {                 \
+  unsigned  i;                                       \
+  mp_ptr    wp;                                      \
+  double    t;                                       \
+  mpfr_t    w, x, y;                                 \
+  mp_size_t size;                                    \
+  TMP_DECL (marker);                                 \
+                                                     \
+  SPEED_RESTRICT_COND (s->size >= MPFR_PREC_MIN);    \
+  SPEED_RESTRICT_COND (s->size <= MPFR_PREC_MAX);    \
+  TMP_MARK (marker);                                 \
+                                                     \
+  size = (s->size-1)/BITS_PER_MP_LIMB+1;             \
+  s->xp[size-1] |= MPFR_LIMB_HIGHBIT;                \
+  MPFR_TMP_INIT1 (s->xp, x, s->size);                \
+  MPFR_SET_EXP (x, 0);                               \
+  s->yp[size-1] |= MPFR_LIMB_HIGHBIT;                \
+  MPFR_TMP_INIT1 (s->yp, y, s->size);                \
+  MPFR_SET_EXP (y, 0);                               \
+                                                     \
+  MPFR_TMP_INIT (wp, w, s->size, size);              \
+                                                     \
+  speed_operand_src (s, s->xp, size);                \
+  speed_operand_src (s, s->yp, size);                \
+  speed_operand_dst (s, wp, size);                   \
+  speed_cache_fill (s);                              \
+                                                     \
+  speed_starttime ();                                \
+  i = s->reps;                                       \
+  do                                                 \
+    mean_fun (w, x, y, GMP_RNDN);                    \
+  while (--i != 0);                                  \
+  t = speed_endtime ();                              \
+                                                     \
+  TMP_FREE (marker);                                 \
+  return t;                                          \
+} while (0)
 
 
 /* First we include all the functions we want to tune inside this program.
@@ -96,10 +130,22 @@ static double speed_mpfr_exp (struct speed_params *s) {
   SPEED_MPFR_FUNC (mpfr_exp);
 }
 
+/* Setup mpfr_mul */
+mp_prec_t mpfr_mul_threshold;
+#undef  MPFR_MUL_THRESHOLD
+#define MPFR_MUL_THRESHOLD mpfr_mul_threshold
+#include "mul.c"
+static double speed_mpfr_mul (struct speed_params *s) {
+  SPEED_MPFR_OP (mpfr_mul);
+}
+
 
 
-/* Common functions (inspired by GMP function) */
-static int analyze_data (double *dat, int ndat) {
+/************************************************
+ * Common functions (inspired by GMP function)  *
+ ************************************************/
+static int 
+analyze_data (double *dat, int ndat) {
   double  x, min_x;
   int     j, min_j;
 
@@ -122,6 +168,8 @@ static int analyze_data (double *dat, int ndat) {
   return min_j;
 }
 
+#define THRESHOLD_WINDOW 16
+#define THRESHOLD_FINAL_WINDOW 128
 static double domeasure (mp_prec_t *threshold, 
 			 double (*func) (struct speed_params *),
 			 mp_prec_t p)
@@ -133,19 +181,29 @@ static double domeasure (mp_prec_t *threshold,
   s.align_xp = s.align_yp = s.align_wp = 64;
   s.size = p;
   size = (p - 1)/BITS_PER_MP_LIMB+1;
-  s.xp = malloc (size*sizeof (mp_limb_t));
+  s.xp = malloc (2*size*sizeof (mp_limb_t));
   if (s.xp == NULL)
     {
       fprintf (stderr, "Can't allocate memory.\n");
       abort ();
     }
   mpn_random (s.xp, size);
+  s.yp = s.xp + size;
+  mpn_random (s.yp, size);
   *threshold = MPFR_PREC_MAX;
   t1 = speed_measure (func, &s);
-  if (t1 == -1.0) abort ();
-  *threshold = MPFR_PREC_MIN;
+  if (t1 == -1.0) 
+    {
+      fprintf (stderr, "Failed to measure function 1!\n");
+      abort ();
+    }
+  *threshold = /*MPFR_PREC_MIN*/0;
   t2 = speed_measure (func, &s);
-  if (t2 == -1.0) abort ();
+  if (t2 == -1.0) 
+    {
+      fprintf (stderr, "Failed to measure function 2!\n");
+      abort ();
+    }
   free (s.xp);
   /* t1 is the time of the first alog (used for low prec) */
   if (t2 >= t1)
@@ -178,6 +236,9 @@ tune_simple_func (mp_prec_t *threshold,
   pmin = p = pstart;
   d = domeasure (threshold, func, pmin);
   if (d < 0.0) {
+    if (verbose)
+      printf ("Oups: even for %lu, algo 2 seems to be faster!\n", 
+	      (unsigned long) pmin);
     *threshold = MPFR_PREC_MIN;
     return;
   }
@@ -231,19 +292,21 @@ tune_simple_func (mp_prec_t *threshold,
 	/* numpos == numneg ... */
 	if (++ try > 2) {
 	  *threshold = p;
+	  if (verbose)
+	    printf ("Quick find: %lu\n", *threshold);
 	  return ;
 	}      
     }
   
   /* Final tune... */
   if (verbose)
-    printf ("Finalizing in [%lu, %lu]...\n", pmin, pmax);
+    printf ("Finalizing in [%lu, %lu]... ", pmin, pmax);
   for (i = 0 ; i < THRESHOLD_FINAL_WINDOW+1 ; i++)
     measure[i] = domeasure (threshold, func, pmin+i);
   i = analyze_data (measure, THRESHOLD_FINAL_WINDOW+1);
   *threshold = pmin + i;
   if (verbose)
-    printf ("Threshold = %lu\n", *threshold);
+    printf ("%lu\n", *threshold);
   return;
 }
 
@@ -252,9 +315,9 @@ tune_simple_func (mp_prec_t *threshold,
 /************************************
  * Tune Mulder's mulhigh function   *
  ************************************/
-#define TOLERANCE 1.015
+#define TOLERANCE 1.02
 #ifndef MPFR_MULHIGH_SIZE
-# define MPFR_MULHIGH_SIZE 512
+# define MPFR_MULHIGH_SIZE 1024
 #endif
 #define MPFR_MULHIGH_TAB_SIZE MPFR_MULHIGH_SIZE
 #include "mulders.c"
@@ -263,7 +326,7 @@ static double speed_mpfr_mulhigh (struct speed_params *s) {
   SPEED_ROUTINE_MPN_MUL_N (mpfr_mulhigh_n);
 }
 
-/* Tune size N: to speed up! */
+/* Tune size N */
 static mp_size_t
 tune_mulder_upto (mp_size_t n)
 {
@@ -333,10 +396,12 @@ tune_mulder (FILE *f)
 
 
 
-/**************************************************
- *            Tune all the threshold of MPFR      *
- **************************************************/
-static void all (const char *filename) 
+/*******************************************************
+ *            Tune all the threshold of MPFR           *
+ * Warning: tune the function in their dependent order!*
+ *******************************************************/
+static void 
+all (const char *filename) 
 {
   FILE *f;
   time_t  start_time, end_time;
@@ -364,7 +429,7 @@ static void all (const char *filename)
 
   time (&start_time);
   tp = localtime (&start_time);
-  fprintf (f, "/* Generated by tuneup.c, %d-%02d-%02d, ",
+  fprintf (f, "/* Generated by MPFR's tuneup.c, %d-%02d-%02d, ",
 	  tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
 
 #ifdef __ICC
@@ -388,6 +453,14 @@ static void all (const char *filename)
   /* Tune mulhigh */
   tune_mulder (f);
 
+  /* Tune mpfr_mul (Threshold is limb size, but it doesn't matter too much */
+  if (verbose)
+    printf ("Tuning mpfr_mul...\n");
+  tune_simple_func (&mpfr_mul_threshold, speed_mpfr_mul, 
+		    MPFR_PREC_MIN);  
+  fprintf (f, "#define MPFR_MUL_THRESHOLD %lu\n", 
+	   (unsigned long) (mpfr_mul_threshold-1)/BITS_PER_MP_LIMB+1);
+
   /* Tune mpfr_exp_2 */
   if (verbose)
     printf ("Tuning mpfr_exp_2...\n");
@@ -426,7 +499,7 @@ int main (int argc, char *argv[])
   verbose = argc > 1;
 
   if (verbose)
-    printf ("Tuning MPFR.\n");
+    printf ("Tuning MPFR. It may take some times.\n");
 
   all ("mparam.h");