diff options
-rw-r--r-- | NEWS | 1 | ||||
-rw-r--r-- | src/const_euler.c | 371 |
2 files changed, 199 insertions, 173 deletions
@@ -46,6 +46,7 @@ Changes from versions 3.1.* to version 3.2.0: - Dropped K&R C compatibility. - Improved MPFR manual. - New MPFRbench program (see the tools/bench directory). +- Speedup in the mpfr_const_euler function (contributed by Fredrik Johansson). - Bug fixes. In particular, a speed improvement when the --enable-assert or --enable-assert=full configure option is used with GCC. diff --git a/src/const_euler.c b/src/const_euler.c index db033d80b..6e3c3cd37 100644 --- a/src/const_euler.c +++ b/src/const_euler.c @@ -1,7 +1,7 @@ /* mpfr_const_euler -- Euler's constant Copyright 2001-2014 Free Software Foundation, Inc. -Contributed by the AriC and Caramel projects, INRIA. +Contributed by Fredrik Johansson. This file is part of the GNU MPFR Library. @@ -20,6 +20,9 @@ along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ +/* The approximation error bound uses Theorem 1 and Remark 2 in + http://arxiv.org/pdf/1312.0039v1.pdf */ + #define MPFR_NEED_LONGLONG_H #include "mpfr-impl.h" @@ -34,219 +37,241 @@ mpfr_const_euler (mpfr_ptr x, mpfr_rnd_t rnd_mode) { } -static void mpfr_const_euler_S2 (mpfr_ptr, mpfr_t); -static void mpfr_const_euler_R (mpfr_ptr, unsigned long); - -int -mpfr_const_euler_internal (mpfr_t x, mpfr_rnd_t rnd) +typedef struct { - mpfr_prec_t prec = MPFR_PREC(x), m, log2m; - mpfr_t y, z, l2; - unsigned long n; - int inexact; - MPFR_ZIV_DECL (loop); + mpz_t P; + mpz_t Q; + mpz_t T; + mpz_t C; + mpz_t D; + mpz_t V; +} mpfr_const_euler_bs_struct; - MPFR_LOG_FUNC ( - ("rnd=%d", rnd), - ("x[%Pu]=%.*Rg inex=%d", mpfr_get_prec(x), mpfr_log_prec, x, inexact)); +typedef mpfr_const_euler_bs_struct mpfr_const_euler_bs_t[1]; - log2m = MPFR_INT_CEIL_LOG2 (prec); - m = MPFR_ADD_PREC (prec, 2 * log2m + 23); - - mpfr_init2 (y, m); - mpfr_init2 (z, m); +static void +mpfr_const_euler_bs_init (mpfr_const_euler_bs_t s) +{ + mpz_init (s->P); + mpz_init (s->Q); + mpz_init (s->T); + mpz_init (s->C); + mpz_init (s->D); + mpz_init (s->V); +} - mpfr_init2 (l2, sizeof (mpfr_prec_t) * CHAR_BIT); - mpfr_const_log2 (l2, MPFR_RNDU); - mpfr_div_2ui (l2, l2, 1, MPFR_RNDU); /* log(2)/2, rounded upward */ +static void +mpfr_const_euler_bs_clear (mpfr_const_euler_bs_t s) +{ + mpz_clear (s->P); + mpz_clear (s->Q); + mpz_clear (s->T); + mpz_clear (s->C); + mpz_clear (s->D); + mpz_clear (s->V); +} - MPFR_ZIV_INIT (loop, m); - for (;;) +static void +mpfr_const_euler_bs_1 (mpfr_const_euler_bs_t s, + unsigned long n1, unsigned long n2, unsigned long N, + int cont) +{ + if (n2 - n1 == 1) { - mpfr_exp_t exp_S, err; - MPFR_BLOCK_DECL (flags); - - /* since prec >= 1, we have m >= 24 here, which ensures n >= 9 below */ - /* Compute n >= prec * log(2)/2 (see algorithms.tex). */ - mpfr_set_exp_t (z, m, MPFR_RNDU); - mpfr_mul (z, z, l2, MPFR_RNDU); - mpfr_ceil (z, z); - MPFR_LOG_VAR (z); - MPFR_BLOCK (flags, n = mpfr_get_ui (z, MPFR_RNDN)); - MPFR_ASSERTD (n >= 9 && ! MPFR_ERANGEFLAG (flags)); - mpfr_const_euler_S2 (y, z); /* error <= 3 ulps */ - exp_S = MPFR_EXP(y); - mpfr_log (z, z, MPFR_RNDD); /* error <= 1 ulp */ - mpfr_sub (y, y, z, MPFR_RNDN); /* S'(n) - log(n) */ - /* the error is less than 1/2 + 3*2^(exp_S-EXP(y)) + 2^(EXP(z)-EXP(y)) - <= 1/2 + 2^(exp_S+2-EXP(y)) + 2^(EXP(z)-EXP(y)) - <= 1/2 + 2^(1+MAX(exp_S+2,EXP(z))-EXP(y)) */ - err = 1 + MAX(exp_S + 2, MPFR_EXP(z)) - MPFR_EXP(y); - err = (err >= -1) ? err + 1 : 0; /* error <= 2^err ulp(y) */ - exp_S = MPFR_EXP(y); - mpfr_const_euler_R (z, n); /* err <= ulp(1/2) = 2^(-m) */ - mpfr_sub (y, y, z, MPFR_RNDN); - /* err <= 1/2 ulp(y) + 2^(-m) + 2^(err + exp_S - EXP(y)) ulp(y). - Since the result is between 0.5 and 1, ulp(y) = 2^(-m). - So we get 3/2*ulp(y) + 2^(err + exp_S - EXP(y)) ulp(y). - 3/2 + 2^e <= 2^(e+1) for e>=1, and <= 2^2 otherwise */ - err = err + exp_S - MPFR_EXP(y); - err = (err >= 1) ? err + 1 : 2; - if (MPFR_LIKELY (MPFR_CAN_ROUND (y, m - err, prec, rnd))) - break; - MPFR_ZIV_NEXT (loop, m); - mpfr_set_prec (y, m); - mpfr_set_prec (z, m); + mpz_set_ui (s->P, N); + mpz_mul (s->P, s->P, s->P); + mpz_set_ui (s->Q, n1 + 1); + mpz_mul (s->Q, s->Q, s->Q); + mpz_set_ui (s->C, 1); + mpz_set_ui (s->D, n1 + 1); + mpz_set (s->T, s->P); + mpz_set (s->V, s->P); } - MPFR_ZIV_FREE (loop); + else + { + mpfr_const_euler_bs_t L, R; + mpz_t t, u, v; + unsigned long m = (n1 + n2) / 2; - inexact = mpfr_set (x, y, rnd); + mpfr_const_euler_bs_init (L); + mpfr_const_euler_bs_init (R); + mpfr_const_euler_bs_1 (L, n1, m, N, 1); + mpfr_const_euler_bs_1 (R, m, n2, N, 1); - mpfr_clear (y); - mpfr_clear (z); - mpfr_clear (l2); + mpz_init (t); + mpz_init (u); + mpz_init (v); - return inexact; /* always inexact */ + if (cont) + mpz_mul (s->P, L->P, R->P); + + mpz_mul (s->Q, L->Q, R->Q); + mpz_mul (s->D, L->D, R->D); + + /* T = LP RT + RQ LT*/ + mpz_mul (t, L->P, R->T); + mpz_mul (v, R->Q, L->T); + mpz_add (s->T, t, v); + + /* C = LC RD + RC LD */ + if (cont) + { + mpz_mul (s->C, L->C, R->D); + mpz_addmul (s->C, R->C, L->D); + } + + /* V = RD (RQ LV + LC LP RT) + LD LP RV */ + mpz_mul (u, L->P, R->V); + mpz_mul (u, u, L->D); + mpz_mul (v, R->Q, L->V); + mpz_addmul (v, t, L->C); + mpz_mul (v, v, R->D); + mpz_add (s->V, u, v); + + mpfr_const_euler_bs_clear (L); + mpfr_const_euler_bs_clear (R); + mpz_clear (t); + mpz_clear (u); + mpz_clear (v); + } } static void -mpfr_const_euler_S2_aux (mpz_t P, mpz_t Q, mpz_t T, mpfr_t n, - unsigned long a, unsigned long b, int need_P) +mpfr_const_euler_bs_2 (mpz_t P, mpz_t Q, mpz_t T, + unsigned long n1, unsigned long n2, unsigned long N, + int cont) { - if (a + 1 == b) + if (n2 - n1 == 1) { - int inex; - MPFR_BLOCK_DECL (flags); - - MPFR_BLOCK (flags, MPFR_DBGRES (inex = mpfr_get_z (P, n, MPFR_RNDN))); - MPFR_ASSERTD (inex == 0 && ! MPFR_ERANGEFLAG (flags)); - if (a > 1) - mpz_mul_si (P, P, 1 - (long) a); + if (n1 == 0) + { + mpz_set_ui (P, 1); + mpz_set_ui (Q, 4 * N); + } + else + { + mpz_set_ui (P, 2 * n1 - 1); + mpz_pow_ui (P, P, 3); + mpz_set_ui (Q, 32 * n1); + mpz_mul_ui (Q, Q, N); + mpz_mul_ui (Q, Q, N); + } mpz_set (T, P); - mpz_set_ui (Q, a); - mpz_mul_ui (Q, Q, a); } else { - unsigned long c = (a + b) / 2; mpz_t P2, Q2, T2; - mpfr_const_euler_S2_aux (P, Q, T, n, a, c, 1); + unsigned long m = (n1 + n2) / 2; + mpz_init (P2); mpz_init (Q2); mpz_init (T2); - mpfr_const_euler_S2_aux (P2, Q2, T2, n, c, b, 1); + mpfr_const_euler_bs_2 (P, Q, T, n1, m, N, 1); + mpfr_const_euler_bs_2 (P2, Q2, T2, m, n2, N, 1); mpz_mul (T, T, Q2); mpz_mul (T2, T2, P); mpz_add (T, T, T2); - if (need_P) + if (cont) mpz_mul (P, P, P2); mpz_mul (Q, Q, Q2); mpz_clear (P2); mpz_clear (Q2); mpz_clear (T2); - /* divide by 2 if possible */ - { - unsigned long v2; - v2 = mpz_scan1 (P, 0); - c = mpz_scan1 (Q, 0); - if (c < v2) - v2 = c; - c = mpz_scan1 (T, 0); - if (c < v2) - v2 = c; - if (v2) - { - mpz_tdiv_q_2exp (P, P, v2); - mpz_tdiv_q_2exp (Q, Q, v2); - mpz_tdiv_q_2exp (T, T, v2); - } - } } } -/* computes S(n) = sum(n^k*(-1)^(k-1)/k!/k, k=1..ceil(4.319136566 * n)) - using binary splitting. - We have S(n) = sum(f(k), k=1..N) with N=ceil(4.319136566 * n) - and f(k) = n^k*(-1)*(k-1)/k!/k, - thus f(k)/f(k-1) = -n*(k-1)/k^2 -*/ -static void -mpfr_const_euler_S2 (mpfr_t x, mpfr_t n) -{ - mpz_t P, Q, T; - unsigned long N; - mpfr_t NN; - MPFR_BLOCK_DECL (flags); - - mpfr_init2 (NN, 64); - mpfr_set_str_binary (NN, /* a+2 = a*log(a), rounded to +infinity */ - "100.01010001101100101110111100011011001011011111001111001111"); - mpfr_mul (NN, NN, n, MPFR_RNDU); - MPFR_LOG_VAR (NN); - /* N = ceil(a.n) upper bound */ - MPFR_BLOCK (flags, N = mpfr_get_ui (NN, MPFR_RNDU)); - MPFR_ASSERTD (! MPFR_ERANGEFLAG (flags)); - mpz_init (P); - mpz_init (Q); - mpz_init (T); - mpfr_const_euler_S2_aux (P, Q, T, n, 1, N + 1, 0); - mpfr_set_z (x, T, MPFR_RNDN); - mpfr_div_z (x, x, Q, MPFR_RNDN); - mpz_clear (P); - mpz_clear (Q); - mpz_clear (T); - mpfr_clear (NN); -} - -/* computes R(n) = exp(-n)/n * sum(k!/(-n)^k, k=0..n-2) - with error at most 4*ulp(x). Assumes n>=2. - Since x <= exp(-n)/n <= 1/8, then 4*ulp(x) <= ulp(1). -*/ -static void -mpfr_const_euler_R (mpfr_t x, unsigned long n) +int +mpfr_const_euler_internal (mpfr_t x, mpfr_rnd_t rnd) { - unsigned long k, m; - mpz_t a, s; + mpfr_const_euler_bs_t sum; + mpz_t t, u, v; + unsigned long n, N; + mpfr_prec_t prec, wp, magn; mpfr_t y; + int inexact; + MPFR_ZIV_DECL (loop); - MPFR_ASSERTN (n >= 2); /* ensures sum(k!/(-n)^k, k=0..n-2) >= 2/3 */ - - /* as we multiply the sum by exp(-n), we need only PREC(x) - n/LOG2 bits */ - m = MPFR_PREC(x) - (unsigned long) ((double) n / LOG2); + prec = mpfr_get_prec (x); + wp = prec + 24; - mpz_init_set_ui (a, 1); - mpz_mul_2exp (a, a, m); - mpz_init_set (s, a); + mpfr_init2 (y, wp); + mpfr_const_euler_bs_init (sum); + mpz_init (t); + mpz_init (u); + mpz_init (v); - for (k = 1; k <= n; k++) + MPFR_ZIV_INIT (loop, wp); + for (;;) { - mpz_mul_ui (a, a, k); - mpz_fdiv_q_ui (a, a, n); - /* the error e(k) on a is e(k) <= 1 + k/n*e(k-1) with e(0)=0, - i.e. e(k) <= k */ - if (k % 2) - mpz_sub (s, s, a); - else - mpz_add (s, s, a); + /* The approximation error is bounded by 24 exp(-8n) when + n > 1, which is smaller than 2^-wp if + n > (wp + log_2(24)) * (log(2)/8). + Note log2(24) < 5 and log(2)/8 < 866434 / 10000000. */ + mpz_set_ui (t, wp + 5); + mpz_mul_ui (t, t, 866434); + mpz_cdiv_q_ui (t, t, 10000000); + n = mpz_get_ui (t); + + /* It is sufficient to take N >= alpha*n + 1 + where alpha = 3/LambertW(3/e) = 4.970625759544... */ + mpz_set_ui (t, n); + mpz_mul_ui (t, t, 4970626); + mpz_cdiv_q_ui (t, t, 1000000); + mpz_add_ui (t, t, 1); + N = mpz_get_ui (t); + + /* V / ((T + Q) * D) = S / I + where S = sum_{k=0}^{N-1} H_k n^(2k) / (k!)^2, + I = sum_{k=0}^{N-1} n^(2k) / (k!)^2 */ + mpfr_const_euler_bs_1 (sum, 0, N, n, 0); + mpz_add (sum->T, sum->T, sum->Q); + mpz_mul (t, sum->T, sum->D); + mpz_mul_2exp (u, sum->V, wp); + mpz_tdiv_q (v, u, t); + /* v * 2^-wp = S/I with error < 1 */ + + /* C / (D * V) = U where + U = (1/(4n)) sum_{k=0}^{2n-1} [(2k)!]^3 / ((k!)^4 8^(2k) (2n)^(2k)) */ + mpfr_const_euler_bs_2 (sum->C, sum->D, sum->V, 0, 2*n, n, 0); + mpz_mul (t, sum->Q, sum->Q); + mpz_mul (t, t, sum->V); + mpz_mul (u, sum->T, sum->T); + mpz_mul (u, u, sum->D); + mpz_mul_2exp (t, t, wp); + mpz_div (t, t, u); + /* t * 2^-wp = U/I^2 with error < 1 */ + + /* gamma = S/I - U/I^2 - log(n) with error at most 2^-wp */ + mpz_sub (v, v, t); + /* v * 2^-wp now equals gamma + log(n) with error at most 3*2^-wp */ + + /* log(n) < 2^ceil(log2(n)) */ + magn = MPFR_INT_CEIL_LOG2(n); + mpfr_set_prec (y, wp + magn); + mpfr_set_ui (y, n, MPFR_RNDZ); /* exact */ + mpfr_log (y, y, MPFR_RNDZ); /* error < 2^-wp */ + + mpfr_mul_2exp (y, y, wp, MPFR_RNDZ); + mpfr_z_sub (y, v, y, MPFR_RNDZ); + mpfr_div_2exp (y, y, wp, MPFR_RNDZ); + /* rounding error from the last subtraction < 2^-wp */ + /* so y = gamma with error < 5*2^-wp */ + + if (MPFR_LIKELY (MPFR_CAN_ROUND (y, wp - 3, prec, rnd))) + break; + + MPFR_ZIV_NEXT (loop, wp); } - /* the error on s is at most 1+2+...+n = n*(n+1)/2 */ - mpz_fdiv_q_ui (s, s, n); /* err <= 1 + (n+1)/2 */ - MPFR_ASSERTN (MPFR_PREC(x) >= mpz_sizeinbase(s, 2)); - mpfr_set_z (x, s, MPFR_RNDD); /* exact */ - mpfr_div_2ui (x, x, m, MPFR_RNDD); - /* now x = 1/n * sum(k!/(-n)^k, k=0..n-2) <= 1/n */ - /* err(x) <= (n+1)/2^m <= (n+1)*exp(n)/2^PREC(x) */ - - mpfr_init2 (y, m); - mpfr_set_si (y, -(long)n, MPFR_RNDD); /* assumed exact */ - mpfr_exp (y, y, MPFR_RNDD); /* err <= ulp(y) <= exp(-n)*2^(1-m) */ - mpfr_mul (x, x, y, MPFR_RNDD); - /* err <= ulp(x) + (n + 1 + 2/n) / 2^prec(x) - <= ulp(x) + (n + 1 + 2/n) ulp(x)/x since x*2^(-prec(x)) < ulp(x) - <= ulp(x) + (n + 1 + 2/n) 3/(2n) ulp(x) since x >= 2/3*n for n >= 2 - <= 4 * ulp(x) for n >= 2 */ + + MPFR_ZIV_FREE (loop); + inexact = mpfr_set (x, y, rnd); + mpfr_clear (y); + mpz_clear (t); + mpz_clear (u); + mpz_clear (v); + mpfr_const_euler_bs_clear (sum); - mpz_clear (a); - mpz_clear (s); + return inexact; /* always inexact */ } + |