diff options
author | zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> | 2016-05-25 18:39:24 +0000 |
---|---|---|
committer | zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> | 2016-05-25 18:39:24 +0000 |
commit | 85a4a5a204367155de17432baad47d7d17b7a1b0 (patch) | |
tree | 4edc4b13dc659939f434dac1494ff76eae1bc871 | |
parent | cf2ada9015c240d1bcb270209d0316f154e11cbd (diff) | |
download | mpfr-85a4a5a204367155de17432baad47d7d17b7a1b0.tar.gz |
improved mpfr_log_ui
git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/trunk@10374 280ebfd0-de03-0410-8827-d642c229c3f4
-rw-r--r-- | src/log_ui.c | 129 |
1 files changed, 80 insertions, 49 deletions
diff --git a/src/log_ui.c b/src/log_ui.c index f041bf44b..ca9f91d01 100644 --- a/src/log_ui.c +++ b/src/log_ui.c @@ -24,53 +24,62 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., #include "mpfr-impl.h" /* FIXME: mpfr_log_ui is much slower than mpfr_log on some values of n, - e.g. something like 8 times as slow for n around ULONG_MAX/3 on an - x86_64 Linux machine. */ - -/* Auxiliary function: Compute using binary splitting the sum - -sum((-x)^(n+1-n1)/n, n = n1..n2-1) where x = p/2^k, - -1/3 <= x <= 1/3. For n1=1 we get -sum((-x)^n/n, n = 1..n2-1). - Numerator is P[0], denominator is Q[0], - Need 1+ceil(log(n2-n1)/log(2)) cells in P[],Q[]. + e.g. about 4 times as slow for n around ULONG_MAX/3 on an + x86_64 Linux machine, for 10^6 bits of precision. The reason is that + for say n=6148914691236517205 and prec=10^6, the value of T computed + has more than 50M bits, which is much more than needed. Indeed the + binary splitting algorithm for series with a finite radius of convergence + gives rationals of size n*log(n) for a target precision n. One might + truncate the rationals inside the algorithm, but then the error analysis + should be redone. */ + +/* Cf http://www.ginac.de/CLN/binsplit.pdf: the Taylor series of log(1+x) + up to order N for x=p/2^k is T/(B*Q). + P[0] <- (-p)^(n2-n1) [with opposite sign when n1=1] + q <- k*(n2-n1) [corresponding to Q[0] = 2^q] + B[0] <- n1 * (n1+1) * ... * (n2-1) + T[0] <- B[0]*Q[0] * S(n1,n2) + where S(n1,n2) = -sum((-x)^(i-n1+1)/i, i=n1..n2-1) + Assumes p is odd or zero, and -1/3 <= x = p/2^k <= 1/3. */ static void -S (mpz_t *P, mpz_t *Q, unsigned long n1, unsigned long n2, - long p, unsigned long k) +S (mpz_t *P, unsigned long *q, mpz_t *B, mpz_t *T, unsigned long n1, + unsigned long n2, long p, unsigned long k, int need_P) { MPFR_ASSERTD (n1 < n2); if (n2 == n1 + 1) { - mpz_set_si (P[0], p); - mpz_set_ui (Q[0], n1); - mpz_mul_2exp (Q[0], Q[0], k); + mpz_set_si (P[0], (n1 == 1) ? p : -p); + *q = k; + mpz_set_ui (B[0], n1); + /* T = B*Q*S where S = P/(B*Q) thus T = P */ + mpz_set (T[0], P[0]); + /* since p is odd (or zero), there is no common factor 2 between + P and Q, or T and B */ } else { - unsigned long m = (n1 / 2) + (n2 / 2) + (n1 & 1UL & n2); + unsigned long m = (n1 / 2) + (n2 / 2) + (n1 & 1UL & n2), q1; /* m = floor((n1+n2)/2) */ MPFR_ASSERTD (n1 < m && m < n2); - S (P, Q, n1, m, p, k); - S (P + 1, Q + 1, m, n2, p, k); - /* P1/Q1 + (-p)^(m-n1)/2^(k(m-n1)) P2/Q2 = - (P1*2^(k(m-n1))*Q2 + Q1*(-p)^(m-n1)*P2)/(Q1*Q2*2^(k(m-n1))) - We know 2^(k(m-n1)) divides Q1, thus: - (P1*Q2 + (Q1/2^(k(m-n1)))*(-p)^(m-n1)*P2)/(Q1*Q2) */ - mpz_mul (P[0], P[0], Q[1]); /* P1*Q2 */ - mpz_mul (Q[1], Q[0], Q[1]); /* Q1*Q2 */ - mpz_tdiv_q_2exp (Q[0], Q[0], k * (m - n1)); /* Q1/2^(k(m-n1)) */ - mpz_mul (P[1], P[1], Q[0]); /* Q1*P2/2^(k(m-n1)) */ - mpz_swap (Q[0], Q[1]); /* Q1*Q2 */ - if (p > 0) - { - mpz_ui_pow_ui (Q[1], p, m - n1); /* p^m */ - if ((m - n1) & 1) - mpz_neg (Q[1], Q[1]); - } - else - mpz_ui_pow_ui (Q[1], -p, m - n1); /* (-p)^m */ - mpz_mul (P[1], P[1], Q[1]); /* Q1*Q2*2^(km) */ - mpz_add (P[0], P[0], P[1]); + S (P, q, B, T, n1, m, p, k, 1); + S (P + 1, &q1, B + 1, T + 1, m, n2, p, k, need_P); + + /* T0 <- T0*B1*Q1 + P0*B0*T1 */ + mpz_mul (T[1], T[1], P[0]); + mpz_mul (T[1], T[1], B[0]); + mpz_mul (T[0], T[0], B[1]); + /* Q[1] = 2^q1 */ + mpz_mul_2exp (T[0], T[0], q1); /* mpz_mul (T[0], T[0], Q[1]) */ + mpz_add (T[0], T[0], T[1]); + if (need_P) + mpz_mul (P[0], P[0], P[1]); + *q += q1; /* mpz_mul (Q[0], Q[0], Q[1]) */ + mpz_mul (B[0], B[0], B[1]); + + /* there should be no common factors 2 between P, Q and T, + since P is odd (or zero) */ } } @@ -79,7 +88,7 @@ mpfr_log_ui (mpfr_ptr x, unsigned long n, mpfr_rnd_t rnd_mode) { unsigned long k; mpfr_prec_t w; /* working precision */ - mpz_t three_n, *P, *Q; + mpz_t three_n, *P, *B, *T; mpfr_t t, q; int inexact; unsigned long N, lgN, i; @@ -132,16 +141,25 @@ mpfr_log_ui (mpfr_ptr x, unsigned long n, mpfr_rnd_t rnd_mode) MPFR_GROUP_INIT_2(group, w, t, q); MPFR_SAVE_EXPO_MARK (expo); + unsigned long kk = k; + if (p != 0) + while ((p % 2) == 0) /* replace p/2^kk by (p/2)/2^(kk-1) */ + { + p /= 2; + kk --; + } + MPFR_ZIV_INIT (loop, w); for (;;) { mpfr_t tmp; unsigned int err; - /* we need at most w/log2(2^k/|p|) terms for an accuracy of w bits */ + + /* we need at most w/log2(2^kk/|p|) terms for an accuracy of w bits */ mpfr_init2 (tmp, 32); mpfr_set_ui (tmp, (p > 0) ? p : -p, MPFR_RNDU); mpfr_log2 (tmp, tmp, MPFR_RNDU); - mpfr_ui_sub (tmp, k, tmp, MPFR_RNDD); + mpfr_ui_sub (tmp, kk, tmp, MPFR_RNDD); MPFR_ASSERTN (w <= ULONG_MAX); mpfr_ui_div (tmp, w, tmp, MPFR_RNDU); N = mpfr_get_ui (tmp, MPFR_RNDU); @@ -149,17 +167,25 @@ mpfr_log_ui (mpfr_ptr x, unsigned long n, mpfr_rnd_t rnd_mode) N = 2; lgN = MPFR_INT_CEIL_LOG2 (N) + 1; mpfr_clear (tmp); - P = (mpz_t *) MPFR_TMP_ALLOC (2 * lgN * sizeof (mpz_t)); - Q = P + lgN; + P = (mpz_t *) MPFR_TMP_ALLOC (3 * lgN * sizeof (mpz_t)); + B = P + lgN; + T = B + lgN; for (i = 0; i < lgN; i++) { mpz_init (P[i]); - mpz_init (Q[i]); + mpz_init (B[i]); + mpz_init (T[i]); } - S (P, Q, 1, N, p, k); - mpfr_set_z (t, P[0], MPFR_RNDN); /* t = P[0] * (1 + theta_1) */ - mpfr_set_z (q, Q[0], MPFR_RNDN); /* q = Q[0] * (1 + theta_2) */ - mpfr_div (t, t, q, MPFR_RNDN); /* t = P[0]/Q[0] * (1 + theta_3)^3 + + unsigned long q0; + S (P, &q0, B, T, 1, N, p, kk, 0); + // mpz_mul (Q[0], B[0], Q[0]); + // mpz_mul_2exp (B[0], B[0], q0); + + mpfr_set_z (t, T[0], MPFR_RNDN); /* t = P[0] * (1 + theta_1) */ + mpfr_set_z (q, B[0], MPFR_RNDN); /* q = B[0] * (1 + theta_2) */ + mpfr_mul_2exp (q, q, q0, MPFR_RNDN); /* B[0]*Q[0] */ + mpfr_div (t, t, q, MPFR_RNDN); /* t = T[0]/(B[0]*Q[0])*(1 + theta_3)^3 = log(n/2^k) * (1 + theta_4)^4 for |theta_i| < 2^(-w) */ @@ -170,12 +196,17 @@ mpfr_log_ui (mpfr_ptr x, unsigned long n, mpfr_rnd_t rnd_mode) for (i = 0; i < lgN; i++) { mpz_clear (P[i]); - mpz_clear (Q[i]); + mpz_clear (B[i]); + mpz_clear (T[i]); } - /* the maximal error is 5 ulps for P/Q, since |(1+/-u)^4 - 1| < 5*u + /* The maximal error is 5 ulps for P/Q, since |(1+/-u)^4 - 1| < 5*u for u < 2^(-12), k ulps for k*log(2), and 1 ulp for the addition, - thus at most k+6 ulps */ - err = MPFR_INT_CEIL_LOG2 (k + 6); + thus at most k+6 ulps. + Note that there might be some cancellation in the addition: the worst + case is when log(1 + p/2^kk) = log(2/3) ~ -0.405, and with n=3 which + gives k=2, thus we add 2*log(2) = 1.386. Thus in the worst case we + have an exponent decrease of 1, which accounts for +1 in the error. */ + err = MPFR_INT_CEIL_LOG2 (k + 6) + 1; if (MPFR_LIKELY (MPFR_CAN_ROUND (t, w - err, MPFR_PREC(x), rnd_mode))) break; |