diff options
author | zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> | 2000-09-29 16:04:19 +0000 |
---|---|---|
committer | zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> | 2000-09-29 16:04:19 +0000 |
commit | fa07a4955c4dcccd32d36636b32312bc05a7ecdf (patch) | |
tree | 7319ff8c7b0b0eadc7cdc611c54d9cd6b9830ed3 /exp2.c | |
parent | 0f62829e8de8a7a909e972ea22201cdbe60f9a9a (diff) | |
download | mpfr-fa07a4955c4dcccd32d36636b32312bc05a7ecdf.tar.gz |
new faster version with O(n^(1/3)*M(n)) algorithm
git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/trunk@773 280ebfd0-de03-0410-8827-d642c229c3f4
Diffstat (limited to 'exp2.c')
-rw-r--r-- | exp2.c | 246 |
1 files changed, 171 insertions, 75 deletions
@@ -1,6 +1,7 @@ -/* mpfr_exp -- exponential of a floating-point number +/* mpfr_exp2 -- exponential of a floating-point number + using Brent's algorithms in O(n^(1/2)*M(n)) and O(n^(1/3)*M(n)) -Copyright (C) 1999 PolKA project, Inria Lorraine and Loria +Copyright (C) 1999-2000 PolKA project, Inria Lorraine and Loria This file is part of the MPFR Library. @@ -25,13 +26,50 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #include "mpfr.h" -int mpfr_exp2_aux (mpfr_t, mpfr_t, mpfr_t, int); -int mpfr_exp2_aux2 (mpfr_t, mpfr_t, mpfr_t, int); +int mpfr_exp2_aux (mpz_t, mpfr_t, int, int*); +int mpfr_exp2_aux2 (mpz_t, mpfr_t, int, int*); + +#define SWITCH 100 /* number of bits to switch from O(n^(1/2)*M(n)) method + to O(n^(1/3)*M(n)) method */ + +#define MY_INIT_MPZ(x, s) { \ + (x)->_mp_alloc = (s); \ + (x)->_mp_d = (mp_ptr) TMP_ALLOC((s)*BYTES_PER_MP_LIMB); \ + (x)->_mp_size = 0; } /* #define DEBUG */ #define LOG2 0.69314718055994528622 /* log(2) rounded to zero on 53 bits */ +/* if k = the number of bits of z > q, divides z by 2^(k-q) and returns k-q. + Otherwise do nothing and return 0. + */ +mp_exp_t mpz_normalize(mpz_t rop, mpz_t z, int q) +{ + int k; + + k = mpz_sizeinbase(z, 2); + if (k > q) { + mpz_div_2exp(rop, z, k-q); + return k-q; + } + else { + if (rop != z) mpz_set(rop, z); + return 0; + } +} + +/* if expz > target, shift z by (expz-target) bits to the left. + if expz < target, shift z by (target-expz) bits to the right. + Returns target. +*/ +int mpz_normalize2(mpz_t rop, mpz_t z, int expz, int target) +{ + if (target > expz) mpz_div_2exp(rop, z, target-expz); + else mpz_mul_2exp(rop, z, expz-target); + return target; +} + /* use Brent's formula exp(x) = (1+r+r^2/2!+r^3/3!+...)^(2^K)*2^n where x = n*log(2)+(2^K)*r together with Brent-Kung O(t^(1/2)) algorithm for the evaluation of @@ -47,8 +85,9 @@ mpfr_exp2(y, x, rnd_mode) mp_rnd_t rnd_mode; #endif { - int n, expx, K, precy, q, k, l, err; - mpfr_t r, s, t; + int n, expx, K, precy, q, k, l, err, exps; + mpfr_t r, s, t; mpz_t ss; + TMP_DECL(marker); if (FLAG_NAN(x)) { SET_NAN(y); return 1; } if (!NOTZERO(x)) { mpfr_set_ui(y, 1, GMP_RNDN); return 0; } @@ -75,8 +114,11 @@ mpfr_exp2(y, x, rnd_mode) n = (int) floor(mpfr_get_d(x)/LOG2); - K = (int) (precy<500) ? sqrt( (double) precy ) - : pow( 8.0 * (double) precy, 1.0/3.0); + /* for the O(n^(1/2)*M(n)) method, the Taylor series computation of + n/K terms costs about n/(2K) multiplications when computed in fixed + point */ + K = (int) (precy<SWITCH) ? sqrt( (double) (precy + 1)/ 2.0 ) + : pow( 4.0 * (double) precy, 1.0/3.0); l = (precy-1)/K + 1; err = K + (int) ceil(log(2.0*(double)l+18.0)/LOG2); /* add K extra bits, i.e. failure probability <= 1/2^K = O(1/precy) */ @@ -120,31 +162,31 @@ mpfr_exp2(y, x, rnd_mode) #endif mpfr_div_2exp(r, r, K, GMP_RNDU); /* r = (x-n*log(2))/2^K */ + TMP_MARK(marker); + MY_INIT_MPZ(ss, 3 + 2*((q-1)/BITS_PER_MP_LIMB)); + exps = mpz_set_fr(ss, s); /* s <- 1 + r/1! + r^2/2! + ... + r^l/l! */ - l = (precy<500) ? mpfr_exp2_aux(s, r, t, q) /* naive method */ - : mpfr_exp2_aux2(s, r, t, q); /* Brent/Kung method */ + l = (precy<SWITCH) ? mpfr_exp2_aux(ss, r, q, &exps) /* naive method */ + : mpfr_exp2_aux2(ss, r, q, &exps); /* Brent/Kung method */ #ifdef DEBUG printf("l=%d q=%d (K+l)*q^2=%1.3e\n", l, q, (K+l)*(double)q*q); #endif - /* add 2 ulp to take into account rest of summation */ - mpfr_add_one_ulp(s); - mpfr_add_one_ulp(s); - for (k=0;k<K;k++) { - mpfr_mul(s, s, s, GMP_RNDU); -#ifdef DEBUG - printf("k=%d s=%1.20e\n",k,mpfr_get_d(s)); -#endif + mpz_mul(ss, ss, ss); exps <<= 1; + exps += mpz_normalize(ss, ss, q); } + mpfr_set_z(s, ss, GMP_RNDN); EXP(s) += exps; if (n>0) mpfr_mul_2exp(s, s, n, GMP_RNDU); else mpfr_div_2exp(s, s, -n, GMP_RNDU); - /* error is at most 2^K*(2l+18) ulp */ - l = 2*l+17; k=0; while (l) { k++; l >>= 1; } - /* now k = ceil(log(2l+18)/log(2)) */ + /* error is at most 2^K*(3l*(l+1)) ulp for mpfr_exp2_aux */ + if (precy<SWITCH) l = 3*l*(l+1); + else l = l*(l+4); + k=0; while (l) { k++; l >>= 1; } + /* now k = ceil(log(error in ulps)/log(2)) */ K += k; #ifdef DEBUG printf("after mult. by 2^n:\n"); @@ -153,7 +195,7 @@ mpfr_exp2(y, x, rnd_mode) printf("err=%d bits\n", K); #endif - l = mpfr_can_round(s, q-K, GMP_RNDU, rnd_mode, precy); + l = mpfr_can_round(s, q-K, GMP_RNDN, rnd_mode, precy); if (l==0) { #ifdef DEBUG printf("not enough precision, use %d\n", q+BITS_PER_MP_LIMB); @@ -166,94 +208,148 @@ mpfr_exp2(y, x, rnd_mode) mpfr_set(y, s, rnd_mode); + TMP_FREE(marker); mpfr_clear(r); mpfr_clear(s); mpfr_clear(t); return 1; } /* s <- 1 + r/1! + r^2/2! + ... + r^l/l! while EXP(r^l/l!)+EXPR(r)>-q - using naive method with O(l) multiplications - and auxiliary variable t. - Return l + using naive method with O(l) multiplications. + Return the number of iterations l. + The absolute error on s is less than 3*l*(l+1)*2^(-q). + Version using fixed-point arithmetic with mpz instead + of mpfr for internal computations. + s must have at least qn+1 limbs (qn should be enough, but currently fails + since mpz_mul_2exp(s, s, q-1) reallocates qn+1 limbs) */ -int mpfr_exp2_aux(mpfr_t s, mpfr_t r, mpfr_t t, int q) +int mpfr_exp2_aux(mpz_t s, mpfr_t r, int q, int *exps) { - int expr, l; - - mpfr_set_ui(s, 1, GMP_RNDU); - mpfr_set_ui(t, 1, GMP_RNDU); + int l, dif, qn; + mpz_t t, rr; mp_exp_t expt, expr; + TMP_DECL(marker); - l = 1; expr = EXP(r); + TMP_MARK(marker); + qn = 1 + (q-1)/BITS_PER_MP_LIMB; + MY_INIT_MPZ(t, 2*qn+1); /* 2*qn+1 is neeeded since mpz_div_2exp may + need an extra limb */ + MY_INIT_MPZ(rr, qn+1); + mpz_set_ui(t, 1); expt=0; + mpz_set_ui(s, 1); mpz_mul_2exp(s, s, q-1); *exps = 1-q; /* s = 2^(q-1) */ + expr = mpz_set_fr(rr, r); /* no error here */ + + l = 0; do { - mpfr_mul(t, t, r, GMP_RNDU); - mpfr_div_ui(t, t, l, GMP_RNDU); - mpfr_add(s, s, t, GMP_RNDU); l++; - } while (EXP(t)+expr > -q); + mpz_mul(t, t, rr); expt=expt+expr; + dif = *exps + mpz_sizeinbase(s, 2) - expt - mpz_sizeinbase(t, 2); + /* truncates the bits of t which are < ulp(s) = 2^(1-q) */ + expt += mpz_normalize(t, t, q-dif); /* error at most 2^(1-q) */ + mpz_div_ui(t, t, l); /* error at most 2^(1-q) */ + /* the error wrt t^l/l! is here at most 3*l*ulp(s) */ +#ifdef DEBUG + if (expt != *exps) { + fprintf(stderr, "Error: expt != exps %d %d\n", expt, *exps); exit(1); + } +#endif + mpz_add(s, s, t); /* no error here: exact */ + /* ensures rr has the same size as t: after several shifts, the error + on rr is still at most ulp(t)=ulp(s) */ + expr += mpz_normalize(rr, rr, mpz_sizeinbase(t, 2)); + } while (mpz_cmp_ui(t, 0)); + TMP_FREE(marker); return l; } -#define MY_INIT(x, p, s) { \ - PREC(x) = p; \ - MANT(x) = (mp_ptr) TMP_ALLOC(s*BYTES_PER_MP_LIMB); \ - SIZE(x) = s; \ - EXP(x) = 0; } - /* s <- 1 + r/1! + r^2/2! + ... + r^l/l! while EXP(r^l/l!)+EXPR(r)>-q - using Brent/Kung method with O(sqrt(l)) multiplications - and auxiliary variable t. + using Brent/Kung method with O(sqrt(l)) multiplications. Return l. + Uses m multiplications of full size and 2l/m of decreasing size, + i.e. a total equivalent to about m+l/m full multiplications, + i.e. 2*sqrt(l) for m=sqrt(l). + Version using mpz. ss must have at least (sizer+1) limbs. + The error is bounded by (l^2+4*l) ulps where l is the return value. */ -int mpfr_exp2_aux2(mpfr_t s, mpfr_t r, mpfr_t t, int q) +int mpfr_exp2_aux2(mpz_t s, mpfr_t r, int q, int *exps) { - int expr, l, m, i, sizer; mpfr_t *R; mp_limb_t c; + int expr, l, m, i, sizer, *expR, expt, dif, ql; mp_limb_t c; + mpz_t t, *R, rr, tmp; TMP_DECL(marker); - mpfr_set_ui(s, 0, GMP_RNDU); - expr = EXP(r); - /* estimate value of l */ - l = q / (-expr); - m = (int) sqrt(2.0 * (double) l); + l = q / (-EXP(r)); + m = (int) sqrt((double) l); TMP_MARK(marker); - R = (mpfr_t*) TMP_ALLOC((m+1)*sizeof(mpfr_t)); /* R[i] stands for r^i */ - sizer = (PREC(r)+BITS_PER_MP_LIMB-1)/BITS_PER_MP_LIMB; - for (i=0;i<=m;i++) MY_INIT(R[i], PREC(r), sizer); - mpfr_mul(R[2], r, r, GMP_RNDU); - for (i=3;i<=m;i++) mpfr_mul(R[i], R[i-1], r, GMP_RNDU); - mpfr_set_ui(R[0], 1, GMP_RNDU); /* R[0]=1 */ - mpfr_set_ui(R[1], 1, GMP_RNDU); /* R[1] contains r^l/l! */ - - l = 0; + R = (mpz_t*) TMP_ALLOC((m+1)*sizeof(mpz_t)); /* R[i] stands for r^i */ + expR = (int*) TMP_ALLOC((m+1)*sizeof(int)); /* exponent for R[i] */ + sizer = 1 + (PREC(r)-1)/BITS_PER_MP_LIMB; + mpz_init(tmp); + MY_INIT_MPZ(rr, sizer+2); + MY_INIT_MPZ(t, 2*sizer); /* double size for products */ + mpz_set_ui(s, 0); *exps = 1-q; /* initialize s to zero, 1 ulp = 2^(1-q) */ + for (i=0;i<=m;i++) MY_INIT_MPZ(R[i], sizer+2); + expR[1] = mpz_set_fr(R[1], r); /* exact operation: no error */ + expR[1] = mpz_normalize2(R[1], R[1], expR[1], 1-q); /* error <= 1 ulp */ + mpz_mul(t, R[1], R[1]); /* err(t) <= 2 ulps */ + mpz_div_2exp(R[2], t, q-1); /* err(R[2]) <= 3 ulps */ + expR[2] = 1-q; + for (i=3;i<=m;i++) { + mpz_mul(t, R[i-1], R[1]); /* err(t) <= 2*i-2 */ + mpz_div_2exp(R[i], t, q-1); /* err(R[i]) <= 2*i-1 ulps */ + expR[i] = 1-q; + } + mpz_set_ui(R[0], 1); mpz_mul_2exp(R[0], R[0], q-1); expR[0]=1-q; /* R[0]=1 */ + mpz_set_ui(rr, 1); expr=0; /* rr contains r^l/l! */ + /* by induction: err(rr) <= 2*l ulps */ + + l = 0; + ql = q; /* precision used for current giant step */ do { - mpfr_set(t, R[m-1], GMP_RNDU); - /* computes t = 1 + r/(l+1) + ... + r^(m-1)*l!/(l+m-1)! */ - for (i=m-2;i>1;i--) { - mpfr_div_ui(t, t, l+i+1, GMP_RNDU); - mpfr_add(t, t, R[i], GMP_RNDU); + /* all R[i] must have exponent 1-ql */ + if (l) for (i=0;i<m;i++) + expR[i] = mpz_normalize2(R[i], R[i], expR[i], 1-ql); + /* the absolute error on R[i]*rr is still 2*i-1 ulps */ + expt = mpz_normalize2(t, R[m-1], expR[m-1], 1-ql); + /* err(t) <= 2*m-1 ulps */ + /* computes t = 1 + r/(l+1) + ... + r^(m-1)*l!/(l+m-1)! + using Horner's scheme */ + for (i=m-2;i>=0;i--) { + mpz_div_ui(t, t, l+i+1); /* err(t) += 1 ulp */ + mpz_add(t, t, R[i]); } - mpfr_div_ui(t, t, l+2, GMP_RNDU); - mpfr_add(t, t, r, GMP_RNDU); - mpfr_div_ui(t, t, l+1, GMP_RNDU); - mpfr_add(t, t, R[0], GMP_RNDU); + /* now err(t) <= (3m-2) ulps */ /* now multiplies t by r^l/l! and adds to s */ - mpfr_mul(t, t, R[1], GMP_RNDU); - mpfr_add(s, s, t, GMP_RNDU); + mpz_mul(t, t, rr); expt += expr; + expt = mpz_normalize2(t, t, expt, *exps); + /* err(t) <= (3m-1) + err_rr(l) <= (3m-2) + 2*l */ +#ifdef DEBUG + if (expt != *exps) { + fprintf(stderr, "Error: expt != exps %d %d\n", expt, *exps); exit(1); + } +#endif + mpz_add(s, s, t); /* no error here */ - /* updates R[1], should be done using binary splitting too */ - mpfr_mul(R[1], R[1], R[m], GMP_RNDU); + /* updates rr, the multiplication of the factors l+i could be done + using binary splitting too, but it is not sure it would save much */ + mpz_mul(t, rr, R[m]); /* err(t) <= err(rr) + 2m-1 */ + expr += expR[m]; + mpz_set_ui(tmp, 1); for (i=1,c=1;i<=m;i++) { if (l+i > ~((mp_limb_t)0)/c) { - mpfr_div_ui(R[1], R[1], c, GMP_RNDU); + mpz_mul_ui(tmp, tmp, c); c = l+i; } else c *= l+i; } - if (c != 1) mpfr_div_ui(R[1], R[1], c, GMP_RNDU); + if (c != 1) mpz_mul_ui(tmp, tmp, c); /* tmp is exact */ + mpz_fdiv_q(t, t, tmp); /* err(t) <= err(rr) + 2m */ + expr += mpz_normalize(rr, t, ql); /* err_rr(l+1) <= err_rr(l) + 2m+1 */ + ql = q - *exps - mpz_sizeinbase(s, 2) + expr + mpz_sizeinbase(rr, 2); l+=m; - } while (EXP(R[1]) > -q); + } while (expr+mpz_sizeinbase(rr, 2) > -q); TMP_FREE(marker); + mpz_clear(tmp); return l; } |