improved mpfr_log_ui

git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/trunk@10374 280ebfd0-de03-0410-8827-d642c229c3f4
author: zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> 2016-05-25 18:39:24 +0000
committer: zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4> 2016-05-25 18:39:24 +0000
commit: 85a4a5a204367155de17432baad47d7d17b7a1b0 (patch)
tree: 4edc4b13dc659939f434dac1494ff76eae1bc871
parent: cf2ada9015c240d1bcb270209d0316f154e11cbd (diff)
download: mpfr-85a4a5a204367155de17432baad47d7d17b7a1b0.tar.gz
1 files changed, 80 insertions, 49 deletions
diff --git a/src/log_ui.c b/src/log_ui.c
index f041bf44b..ca9f91d01 100644
--- a/src/log_ui.c
+++ b/src/log_ui.c
@@ -24,53 +24,62 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
 #include "mpfr-impl.h"
 
 /* FIXME: mpfr_log_ui is much slower than mpfr_log on some values of n,
-   e.g. something like 8 times as slow for n around ULONG_MAX/3 on an
-   x86_64 Linux machine. */
-
-/* Auxiliary function: Compute using binary splitting the sum
-   -sum((-x)^(n+1-n1)/n, n = n1..n2-1) where x = p/2^k,
-   -1/3 <= x <= 1/3. For n1=1 we get -sum((-x)^n/n, n = 1..n2-1).
-   Numerator is P[0], denominator is Q[0],
-   Need 1+ceil(log(n2-n1)/log(2)) cells in P[],Q[].
+   e.g. about 4 times as slow for n around ULONG_MAX/3 on an
+   x86_64 Linux machine, for 10^6 bits of precision. The reason is that
+   for say n=6148914691236517205 and prec=10^6, the value of T computed
+   has more than 50M bits, which is much more than needed. Indeed the
+   binary splitting algorithm for series with a finite radius of convergence
+   gives rationals of size n*log(n) for a target precision n. One might
+   truncate the rationals inside the algorithm, but then the error analysis
+   should be redone. */
+
+/* Cf http://www.ginac.de/CLN/binsplit.pdf: the Taylor series of log(1+x)
+   up to order N for x=p/2^k is T/(B*Q).
+   P[0] <- (-p)^(n2-n1) [with opposite sign when n1=1]
+   q <- k*(n2-n1) [corresponding to Q[0] = 2^q]
+   B[0] <- n1 * (n1+1) * ... * (n2-1)
+   T[0] <- B[0]*Q[0] * S(n1,n2)
+   where S(n1,n2) = -sum((-x)^(i-n1+1)/i, i=n1..n2-1)
+   Assumes p is odd or zero, and -1/3 <= x = p/2^k <= 1/3.
 */
 static void
-S (mpz_t *P, mpz_t *Q, unsigned long n1, unsigned long n2,
-   long p, unsigned long k)
+S (mpz_t *P, unsigned long *q, mpz_t *B, mpz_t *T, unsigned long n1,
+   unsigned long n2, long p, unsigned long k, int need_P)
 {
   MPFR_ASSERTD (n1 < n2);
   if (n2 == n1 + 1)
     {
-      mpz_set_si (P[0], p);
-      mpz_set_ui (Q[0], n1);
-      mpz_mul_2exp (Q[0], Q[0], k);
+      mpz_set_si (P[0], (n1 == 1) ? p : -p);
+      *q = k;
+      mpz_set_ui (B[0], n1);
+      /* T = B*Q*S where S = P/(B*Q) thus T = P */
+      mpz_set (T[0], P[0]);
+      /* since p is odd (or zero), there is no common factor 2 between
+         P and Q, or T and B */
     }
   else
     {
-      unsigned long m = (n1 / 2) + (n2 / 2) + (n1 & 1UL & n2);
+      unsigned long m = (n1 / 2) + (n2 / 2) + (n1 & 1UL & n2), q1;
       /* m = floor((n1+n2)/2) */
 
       MPFR_ASSERTD (n1 < m && m < n2);
-      S (P, Q, n1, m, p, k);
-      S (P + 1, Q + 1, m, n2, p, k);
-      /* P1/Q1 + (-p)^(m-n1)/2^(k(m-n1)) P2/Q2 =
-         (P1*2^(k(m-n1))*Q2 + Q1*(-p)^(m-n1)*P2)/(Q1*Q2*2^(k(m-n1)))
-         We know 2^(k(m-n1)) divides Q1, thus:
-         (P1*Q2 + (Q1/2^(k(m-n1)))*(-p)^(m-n1)*P2)/(Q1*Q2) */
-      mpz_mul (P[0], P[0], Q[1]);       /* P1*Q2 */
-      mpz_mul (Q[1], Q[0], Q[1]);       /* Q1*Q2 */
-      mpz_tdiv_q_2exp (Q[0], Q[0], k * (m - n1)); /* Q1/2^(k(m-n1)) */
-      mpz_mul (P[1], P[1], Q[0]);       /* Q1*P2/2^(k(m-n1)) */
-      mpz_swap (Q[0], Q[1]);            /* Q1*Q2 */
-      if (p > 0)
-        {
-          mpz_ui_pow_ui (Q[1], p, m - n1);       /* p^m */
-          if ((m - n1) & 1)
-            mpz_neg (Q[1], Q[1]);
-        }
-      else
-        mpz_ui_pow_ui (Q[1], -p, m - n1);       /* (-p)^m */
-      mpz_mul (P[1], P[1], Q[1]);       /* Q1*Q2*2^(km) */
-      mpz_add (P[0], P[0], P[1]);
+      S (P, q, B, T, n1, m, p, k, 1);
+      S (P + 1, &q1, B + 1, T + 1, m, n2, p, k, need_P);
+
+      /* T0 <- T0*B1*Q1 + P0*B0*T1 */
+      mpz_mul (T[1], T[1], P[0]);
+      mpz_mul (T[1], T[1], B[0]);
+      mpz_mul (T[0], T[0], B[1]);
+      /* Q[1] = 2^q1 */
+      mpz_mul_2exp (T[0], T[0], q1); /* mpz_mul (T[0], T[0], Q[1]) */
+      mpz_add (T[0], T[0], T[1]);
+      if (need_P)
+        mpz_mul (P[0], P[0], P[1]);
+      *q += q1; /* mpz_mul (Q[0], Q[0], Q[1]) */
+      mpz_mul (B[0], B[0], B[1]);
+
+      /* there should be no common factors 2 between P, Q and T,
+         since P is odd (or zero) */
     }
 }
 
@@ -79,7 +88,7 @@ mpfr_log_ui (mpfr_ptr x, unsigned long n, mpfr_rnd_t rnd_mode)
 {
   unsigned long k;
   mpfr_prec_t w; /* working precision */
-  mpz_t three_n, *P, *Q;
+  mpz_t three_n, *P, *B, *T;
   mpfr_t t, q;
   int inexact;
   unsigned long N, lgN, i;
@@ -132,16 +141,25 @@ mpfr_log_ui (mpfr_ptr x, unsigned long n, mpfr_rnd_t rnd_mode)
   MPFR_GROUP_INIT_2(group, w, t, q);
   MPFR_SAVE_EXPO_MARK (expo);
 
+  unsigned long kk = k;
+  if (p != 0)
+    while ((p % 2) == 0) /* replace p/2^kk by (p/2)/2^(kk-1) */
+      {
+        p /= 2;
+        kk --;
+      }
+
   MPFR_ZIV_INIT (loop, w);
   for (;;)
     {
       mpfr_t tmp;
       unsigned int err;
-      /* we need at most w/log2(2^k/|p|) terms for an accuracy of w bits */
+
+      /* we need at most w/log2(2^kk/|p|) terms for an accuracy of w bits */
       mpfr_init2 (tmp, 32);
       mpfr_set_ui (tmp, (p > 0) ? p : -p, MPFR_RNDU);
       mpfr_log2 (tmp, tmp, MPFR_RNDU);
-      mpfr_ui_sub (tmp, k, tmp, MPFR_RNDD);
+      mpfr_ui_sub (tmp, kk, tmp, MPFR_RNDD);
       MPFR_ASSERTN (w <= ULONG_MAX);
       mpfr_ui_div (tmp, w, tmp, MPFR_RNDU);
       N = mpfr_get_ui (tmp, MPFR_RNDU);
@@ -149,17 +167,25 @@ mpfr_log_ui (mpfr_ptr x, unsigned long n, mpfr_rnd_t rnd_mode)
         N = 2;
       lgN = MPFR_INT_CEIL_LOG2 (N) + 1;
       mpfr_clear (tmp);
-      P = (mpz_t *) MPFR_TMP_ALLOC (2 * lgN * sizeof (mpz_t));
-      Q = P + lgN;
+      P = (mpz_t *) MPFR_TMP_ALLOC (3 * lgN * sizeof (mpz_t));
+      B = P + lgN;
+      T = B + lgN;
       for (i = 0; i < lgN; i++)
         {
           mpz_init (P[i]);
-          mpz_init (Q[i]);
+          mpz_init (B[i]);
+          mpz_init (T[i]);
         }
-      S (P, Q, 1, N, p, k);
-      mpfr_set_z (t, P[0], MPFR_RNDN); /* t = P[0] * (1 + theta_1) */
-      mpfr_set_z (q, Q[0], MPFR_RNDN); /* q = Q[0] * (1 + theta_2) */
-      mpfr_div (t, t, q, MPFR_RNDN);   /* t = P[0]/Q[0] * (1 + theta_3)^3
+
+      unsigned long q0;
+      S (P, &q0, B, T, 1, N, p, kk, 0);
+      // mpz_mul (Q[0], B[0], Q[0]);
+      // mpz_mul_2exp (B[0], B[0], q0);
+
+      mpfr_set_z (t, T[0], MPFR_RNDN); /* t = P[0] * (1 + theta_1) */
+      mpfr_set_z (q, B[0], MPFR_RNDN); /* q = B[0] * (1 + theta_2) */
+      mpfr_mul_2exp (q, q, q0, MPFR_RNDN); /* B[0]*Q[0] */
+      mpfr_div (t, t, q, MPFR_RNDN);   /* t = T[0]/(B[0]*Q[0])*(1 + theta_3)^3
                                             = log(n/2^k) * (1 + theta_4)^4
                                             for |theta_i| < 2^(-w) */
 
@@ -170,12 +196,17 @@ mpfr_log_ui (mpfr_ptr x, unsigned long n, mpfr_rnd_t rnd_mode)
       for (i = 0; i < lgN; i++)
         {
           mpz_clear (P[i]);
-          mpz_clear (Q[i]);
+          mpz_clear (B[i]);
+          mpz_clear (T[i]);
         }
-      /* the maximal error is 5 ulps for P/Q, since |(1+/-u)^4 - 1| < 5*u
+      /* The maximal error is 5 ulps for P/Q, since |(1+/-u)^4 - 1| < 5*u
          for u < 2^(-12), k ulps for k*log(2), and 1 ulp for the addition,
-         thus at most k+6 ulps */
-      err = MPFR_INT_CEIL_LOG2 (k + 6);
+         thus at most k+6 ulps.
+         Note that there might be some cancellation in the addition: the worst
+         case is when log(1 + p/2^kk) = log(2/3) ~ -0.405, and with n=3 which
+         gives k=2, thus we add 2*log(2) = 1.386. Thus in the worst case we
+         have an exponent decrease of 1, which accounts for +1 in the error. */
+      err = MPFR_INT_CEIL_LOG2 (k + 6) + 1;
       if (MPFR_LIKELY (MPFR_CAN_ROUND (t, w - err, MPFR_PREC(x), rnd_mode)))
         break;
author	zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4>	2016-05-25 18:39:24 +0000
committer	zimmerma <zimmerma@280ebfd0-de03-0410-8827-d642c229c3f4>	2016-05-25 18:39:24 +0000
commit	85a4a5a204367155de17432baad47d7d17b7a1b0 (patch)
tree	4edc4b13dc659939f434dac1494ff76eae1bc871
parent	cf2ada9015c240d1bcb270209d0316f154e11cbd (diff)
download	mpfr-85a4a5a204367155de17432baad47d7d17b7a1b0.tar.gz