summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--algorithms.tex22
-rw-r--r--mul.c65
-rw-r--r--tuneup.c113
3 files changed, 179 insertions, 21 deletions
diff --git a/algorithms.tex b/algorithms.tex
index 86a2cda44..11c98de76 100644
--- a/algorithms.tex
+++ b/algorithms.tex
@@ -541,6 +541,28 @@ where $b[i]$ and $c[i]$ is meant as $0$ for negative $i$,
and $c[i]$ is meant as $0$ for $i \ge {\tt cn}$
(${\tt cancel_b} \ge 0$, but ${\tt cancel_c}$ may be negative).
+\subsection{The {\tt mpfr\_mul} function}
+
+{\tt mpfr\_mul} uses two algorithms: if the precision of the operands
+is small enough, a plain multiplication using {\tt mpn\_mul} is used
+(there is no error, except in the final rounding);
+otherwise it uses {\tt mpfr\_mulhigh\_n}.
+
+In this case, it trunks the two operands to $m$ limbs:
+$1/2 \leq b < 1$ and $1/2 \leq c < 1$, $b = bh+bl$ and $c = ch+c$
+($B=2^{32} or 2^{64}$).
+The error comes from:
+\begin {itemize}
+\item Truncation: $ \leq bl.ch + bh.cl + bl.cl \leq bl + cl \leq 2 B^{-m}$
+\item Mulder: Assuming $error(Mulder(n)) \leq error(mulhigh_basecase(n))$
+ $error(mulhigh(n) \leq (n-1) (B-1)^2 B^{-n-2} + ... + 1 (B-1)^2 B^{-2 n}$
+ $ = \sum_{i=1}^{n-1}{(n-i) (B-1)^2 B^{-n-1-i}}$
+ $ = (B-1)^2 B^{-n-1} \sum_{i=1}^{n-1}{B^{-i}}$
+ $ = (b-1)^2 B^{-n-1} \frac{B^{1-n}-n+n B-B}{(1-B)^2}$
+ $ \leq n B^{-n}$
+\end {itemize}
+Total error: $\leq (m+2) B^{-m}$
+
\subsection{The {\tt mpfr\_div} function}
The goals of the code of the {\tt mpfr\_div} function include the fact
diff --git a/mul.c b/mul.c
index 7cadc6a93..bd98947a6 100644
--- a/mul.c
+++ b/mul.c
@@ -20,6 +20,7 @@ along with the MPFR Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */
+#define MPFR_NEED_LONGLONG_H
#include "mpfr-impl.h"
int
@@ -113,6 +114,7 @@ mpfr_mul (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t rnd_mode)
tmp = (mp_limb_t *) TMP_ALLOC ((size_t) k * BYTES_PER_MP_LIMB);
/* multiplies two mantissa in temporary allocated space */
+#if 0
b1 = MPFR_LIKELY (bn >= cn)
? mpn_mul (tmp, MPFR_MANT (b), bn, MPFR_MANT (c), cn)
: mpn_mul (tmp, MPFR_MANT (c), cn, MPFR_MANT (b), bn);
@@ -128,10 +130,71 @@ mpfr_mul (mpfr_ptr a, mpfr_srcptr b, mpfr_srcptr c, mp_rnd_t rnd_mode)
if (MPFR_UNLIKELY (b1 == 0))
mpn_lshift (tmp, tmp, tn, 1); /* tn <= k, so no stack corruption */
+#else
+
+ if (MPFR_UNLIKELY (bn < cn))
+ {
+ mpfr_srcptr tmp = b;
+ mp_size_t tn = bn;
+ b = c;
+ bn = cn;
+ c = tmp;
+ cn = tn;
+ }
+ if (MPFR_UNLIKELY (bn > MPFR_MUL_THRESHOLD))
+ {
+ mp_size_t n;
+ mp_prec_t p;
+
+ /* Compute estimated precision of mulhigh */
+ n = MPFR_LIMB_SIZE (a) + 1;
+ n = MIN (n, cn);
+ MPFR_ASSERTD (n >= 1 && 2*n <= k && n <= cn && n <= bn);
+ p = n*BITS_PER_MP_LIMB - MPFR_INT_CEIL_LOG2 (n + (n < cn) + (n < bn) );
+ if (MPFR_UNLIKELY (MPFR_PREC (a) > p - 4))
+ /* MulHigh can't produce a roundable result. */
+ goto full_multiply;
+ /* Compute an approximation of the product of b and c */
+ mpfr_mulhigh_n (tmp+k-2*n, MPFR_MANT (b) + bn - n,
+ MPFR_MANT (c) + cn - n, n);
+
+ /* now tmp[0]..tmp[k-1] contains the product of both mantissa,
+ with tmp[k-1]>=2^(BITS_PER_MP_LIMB-2) */
+ b1 = tmp[k-1] >> (BITS_PER_MP_LIMB - 1); /* msb from the product */
+
+ /* if the mantissas of b and c are uniformly distributed in ]1/2, 1],
+ then their product is in ]1/4, 1/2] with probability 2*ln(2)-1 ~ 0.386
+ and in [1/2, 1] with probability 2-2*ln(2) ~ 0.614 */
+ tmp += k - tn;
+ if (MPFR_UNLIKELY (b1 == 0))
+ mpn_lshift (tmp, tmp, tn, 1);
+
+ if (MPFR_UNLIKELY (!mpfr_can_round_raw (tmp, tn, sign, p + b1 - 1,
+ GMP_RNDN, GMP_RNDZ, MPFR_PREC(a)+(rnd_mode==GMP_RNDN))))
+ goto full_multiply;
+ }
+ else
+ {
+ full_multiply:
+ b1 = mpn_mul (tmp, MPFR_MANT (b), bn, MPFR_MANT (c), cn);
+
+ /* now tmp[0]..tmp[k-1] contains the product of both mantissa,
+ with tmp[k-1]>=2^(BITS_PER_MP_LIMB-2) */
+ b1 >>= BITS_PER_MP_LIMB - 1; /* msb from the product */
+
+ /* if the mantissas of b and c are uniformly distributed in ]1/2, 1],
+ then their product is in ]1/4, 1/2] with probability 2*ln(2)-1 ~ 0.386
+ and in [1/2, 1] with probability 2-2*ln(2) ~ 0.614 */
+ tmp += k - tn;
+ if (MPFR_UNLIKELY (b1 == 0))
+ mpn_lshift (tmp, tmp, tn, 1); /* tn <= k, so no stack corruption */
+ }
+#endif
+
ax2 = ax + (mp_exp_t) (b1 - 1);
MPFR_RNDRAW (inexact, a, tmp, bq+cq, rnd_mode, sign, ax2++);
TMP_FREE (marker);
- MPFR_EXP (a) = ax2; /* Can't use MPFR_SET_EXP: Exponent may be out of range */
+ MPFR_EXP (a) = ax2; /* Can't use MPFR_SET_EXP: Expo may be out of range */
MPFR_SET_SIGN (a, sign);
if (MPFR_UNLIKELY (ax2 > __gmpfr_emax))
return mpfr_overflow (a, rnd_mode, sign);
diff --git a/tuneup.c b/tuneup.c
index 776934081..4b701a1be 100644
--- a/tuneup.c
+++ b/tuneup.c
@@ -30,9 +30,6 @@ MA 02111-1307, USA. */
#define _PROTO __GMP_PROTO
#include "speed.h"
-#define THRESHOLD_WINDOW 16
-#define THRESHOLD_FINAL_WINDOW 128
-
int verbose;
/* s->size: precision of both input and output
@@ -73,6 +70,43 @@ int verbose;
return t; \
} while (0)
+#define SPEED_MPFR_OP(mean_fun) do { \
+ unsigned i; \
+ mp_ptr wp; \
+ double t; \
+ mpfr_t w, x, y; \
+ mp_size_t size; \
+ TMP_DECL (marker); \
+ \
+ SPEED_RESTRICT_COND (s->size >= MPFR_PREC_MIN); \
+ SPEED_RESTRICT_COND (s->size <= MPFR_PREC_MAX); \
+ TMP_MARK (marker); \
+ \
+ size = (s->size-1)/BITS_PER_MP_LIMB+1; \
+ s->xp[size-1] |= MPFR_LIMB_HIGHBIT; \
+ MPFR_TMP_INIT1 (s->xp, x, s->size); \
+ MPFR_SET_EXP (x, 0); \
+ s->yp[size-1] |= MPFR_LIMB_HIGHBIT; \
+ MPFR_TMP_INIT1 (s->yp, y, s->size); \
+ MPFR_SET_EXP (y, 0); \
+ \
+ MPFR_TMP_INIT (wp, w, s->size, size); \
+ \
+ speed_operand_src (s, s->xp, size); \
+ speed_operand_src (s, s->yp, size); \
+ speed_operand_dst (s, wp, size); \
+ speed_cache_fill (s); \
+ \
+ speed_starttime (); \
+ i = s->reps; \
+ do \
+ mean_fun (w, x, y, GMP_RNDN); \
+ while (--i != 0); \
+ t = speed_endtime (); \
+ \
+ TMP_FREE (marker); \
+ return t; \
+} while (0)
/* First we include all the functions we want to tune inside this program.
@@ -96,10 +130,22 @@ static double speed_mpfr_exp (struct speed_params *s) {
SPEED_MPFR_FUNC (mpfr_exp);
}
+/* Setup mpfr_mul */
+mp_prec_t mpfr_mul_threshold;
+#undef MPFR_MUL_THRESHOLD
+#define MPFR_MUL_THRESHOLD mpfr_mul_threshold
+#include "mul.c"
+static double speed_mpfr_mul (struct speed_params *s) {
+ SPEED_MPFR_OP (mpfr_mul);
+}
+
-/* Common functions (inspired by GMP function) */
-static int analyze_data (double *dat, int ndat) {
+/************************************************
+ * Common functions (inspired by GMP function) *
+ ************************************************/
+static int
+analyze_data (double *dat, int ndat) {
double x, min_x;
int j, min_j;
@@ -122,6 +168,8 @@ static int analyze_data (double *dat, int ndat) {
return min_j;
}
+#define THRESHOLD_WINDOW 16
+#define THRESHOLD_FINAL_WINDOW 128
static double domeasure (mp_prec_t *threshold,
double (*func) (struct speed_params *),
mp_prec_t p)
@@ -133,19 +181,29 @@ static double domeasure (mp_prec_t *threshold,
s.align_xp = s.align_yp = s.align_wp = 64;
s.size = p;
size = (p - 1)/BITS_PER_MP_LIMB+1;
- s.xp = malloc (size*sizeof (mp_limb_t));
+ s.xp = malloc (2*size*sizeof (mp_limb_t));
if (s.xp == NULL)
{
fprintf (stderr, "Can't allocate memory.\n");
abort ();
}
mpn_random (s.xp, size);
+ s.yp = s.xp + size;
+ mpn_random (s.yp, size);
*threshold = MPFR_PREC_MAX;
t1 = speed_measure (func, &s);
- if (t1 == -1.0) abort ();
- *threshold = MPFR_PREC_MIN;
+ if (t1 == -1.0)
+ {
+ fprintf (stderr, "Failed to measure function 1!\n");
+ abort ();
+ }
+ *threshold = /*MPFR_PREC_MIN*/0;
t2 = speed_measure (func, &s);
- if (t2 == -1.0) abort ();
+ if (t2 == -1.0)
+ {
+ fprintf (stderr, "Failed to measure function 2!\n");
+ abort ();
+ }
free (s.xp);
/* t1 is the time of the first alog (used for low prec) */
if (t2 >= t1)
@@ -178,6 +236,9 @@ tune_simple_func (mp_prec_t *threshold,
pmin = p = pstart;
d = domeasure (threshold, func, pmin);
if (d < 0.0) {
+ if (verbose)
+ printf ("Oups: even for %lu, algo 2 seems to be faster!\n",
+ (unsigned long) pmin);
*threshold = MPFR_PREC_MIN;
return;
}
@@ -231,19 +292,21 @@ tune_simple_func (mp_prec_t *threshold,
/* numpos == numneg ... */
if (++ try > 2) {
*threshold = p;
+ if (verbose)
+ printf ("Quick find: %lu\n", *threshold);
return ;
}
}
/* Final tune... */
if (verbose)
- printf ("Finalizing in [%lu, %lu]...\n", pmin, pmax);
+ printf ("Finalizing in [%lu, %lu]... ", pmin, pmax);
for (i = 0 ; i < THRESHOLD_FINAL_WINDOW+1 ; i++)
measure[i] = domeasure (threshold, func, pmin+i);
i = analyze_data (measure, THRESHOLD_FINAL_WINDOW+1);
*threshold = pmin + i;
if (verbose)
- printf ("Threshold = %lu\n", *threshold);
+ printf ("%lu\n", *threshold);
return;
}
@@ -252,9 +315,9 @@ tune_simple_func (mp_prec_t *threshold,
/************************************
* Tune Mulder's mulhigh function *
************************************/
-#define TOLERANCE 1.015
+#define TOLERANCE 1.02
#ifndef MPFR_MULHIGH_SIZE
-# define MPFR_MULHIGH_SIZE 512
+# define MPFR_MULHIGH_SIZE 1024
#endif
#define MPFR_MULHIGH_TAB_SIZE MPFR_MULHIGH_SIZE
#include "mulders.c"
@@ -263,7 +326,7 @@ static double speed_mpfr_mulhigh (struct speed_params *s) {
SPEED_ROUTINE_MPN_MUL_N (mpfr_mulhigh_n);
}
-/* Tune size N: to speed up! */
+/* Tune size N */
static mp_size_t
tune_mulder_upto (mp_size_t n)
{
@@ -333,10 +396,12 @@ tune_mulder (FILE *f)
-/**************************************************
- * Tune all the threshold of MPFR *
- **************************************************/
-static void all (const char *filename)
+/*******************************************************
+ * Tune all the threshold of MPFR *
+ * Warning: tune the function in their dependent order!*
+ *******************************************************/
+static void
+all (const char *filename)
{
FILE *f;
time_t start_time, end_time;
@@ -364,7 +429,7 @@ static void all (const char *filename)
time (&start_time);
tp = localtime (&start_time);
- fprintf (f, "/* Generated by tuneup.c, %d-%02d-%02d, ",
+ fprintf (f, "/* Generated by MPFR's tuneup.c, %d-%02d-%02d, ",
tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
#ifdef __ICC
@@ -388,6 +453,14 @@ static void all (const char *filename)
/* Tune mulhigh */
tune_mulder (f);
+ /* Tune mpfr_mul (Threshold is limb size, but it doesn't matter too much */
+ if (verbose)
+ printf ("Tuning mpfr_mul...\n");
+ tune_simple_func (&mpfr_mul_threshold, speed_mpfr_mul,
+ MPFR_PREC_MIN);
+ fprintf (f, "#define MPFR_MUL_THRESHOLD %lu\n",
+ (unsigned long) (mpfr_mul_threshold-1)/BITS_PER_MP_LIMB+1);
+
/* Tune mpfr_exp_2 */
if (verbose)
printf ("Tuning mpfr_exp_2...\n");
@@ -426,7 +499,7 @@ int main (int argc, char *argv[])
verbose = argc > 1;
if (verbose)
- printf ("Tuning MPFR.\n");
+ printf ("Tuning MPFR. It may take some times.\n");
all ("mparam.h");