2 files changed, 199 insertions, 173 deletions
diff --git a/NEWS b/NEWS
index 217ea0115..0259a0572 100644
--- a/NEWS
+++ b/NEWS
@@ -46,6 +46,7 @@ Changes from versions 3.1.* to version 3.2.0:
 - Dropped K&R C compatibility.
 - Improved MPFR manual.
 - New MPFRbench program (see the tools/bench directory).
+- Speedup in the mpfr_const_euler function (contributed by Fredrik Johansson).
 - Bug fixes. In particular, a speed improvement when the --enable-assert
   or --enable-assert=full configure option is used with GCC.
 
diff --git a/src/const_euler.c b/src/const_euler.c
index db033d80b..6e3c3cd37 100644
--- a/src/const_euler.c
+++ b/src/const_euler.c
@@ -1,7 +1,7 @@
 /* mpfr_const_euler -- Euler's constant
 
 Copyright 2001-2014 Free Software Foundation, Inc.
-Contributed by the AriC and Caramel projects, INRIA.
+Contributed by Fredrik Johansson.
 
 This file is part of the GNU MPFR Library.
 
@@ -20,6 +20,9 @@ along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
 
+/* The approximation error bound uses Theorem 1 and Remark 2 in
+   http://arxiv.org/pdf/1312.0039v1.pdf */
+
 #define MPFR_NEED_LONGLONG_H
 #include "mpfr-impl.h"
 
@@ -34,219 +37,241 @@ mpfr_const_euler (mpfr_ptr x, mpfr_rnd_t rnd_mode) {
 }
 
 
-static void mpfr_const_euler_S2 (mpfr_ptr, mpfr_t);
-static void mpfr_const_euler_R (mpfr_ptr, unsigned long);
-
-int
-mpfr_const_euler_internal (mpfr_t x, mpfr_rnd_t rnd)
+typedef struct
 {
-  mpfr_prec_t prec = MPFR_PREC(x), m, log2m;
-  mpfr_t y, z, l2;
-  unsigned long n;
-  int inexact;
-  MPFR_ZIV_DECL (loop);
+  mpz_t P;
+  mpz_t Q;
+  mpz_t T;
+  mpz_t C;
+  mpz_t D;
+  mpz_t V;
+} mpfr_const_euler_bs_struct;
 
-  MPFR_LOG_FUNC (
-    ("rnd=%d", rnd),
-    ("x[%Pu]=%.*Rg inex=%d", mpfr_get_prec(x), mpfr_log_prec, x, inexact));
+typedef mpfr_const_euler_bs_struct mpfr_const_euler_bs_t[1];
 
-  log2m = MPFR_INT_CEIL_LOG2 (prec);
-  m = MPFR_ADD_PREC (prec, 2 * log2m + 23);
-
-  mpfr_init2 (y, m);
-  mpfr_init2 (z, m);
+static void
+mpfr_const_euler_bs_init (mpfr_const_euler_bs_t s)
+{
+  mpz_init (s->P);
+  mpz_init (s->Q);
+  mpz_init (s->T);
+  mpz_init (s->C);
+  mpz_init (s->D);
+  mpz_init (s->V);
+}
 
-  mpfr_init2 (l2, sizeof (mpfr_prec_t) * CHAR_BIT);
-  mpfr_const_log2 (l2, MPFR_RNDU);
-  mpfr_div_2ui (l2, l2, 1, MPFR_RNDU);  /* log(2)/2, rounded upward */
+static void
+mpfr_const_euler_bs_clear (mpfr_const_euler_bs_t s)
+{
+  mpz_clear (s->P);
+  mpz_clear (s->Q);
+  mpz_clear (s->T);
+  mpz_clear (s->C);
+  mpz_clear (s->D);
+  mpz_clear (s->V);
+}
 
-  MPFR_ZIV_INIT (loop, m);
-  for (;;)
+static void
+mpfr_const_euler_bs_1 (mpfr_const_euler_bs_t s, 
+                       unsigned long n1, unsigned long n2, unsigned long N,
+                       int cont)
+{
+  if (n2 - n1 == 1)
     {
-      mpfr_exp_t exp_S, err;
-      MPFR_BLOCK_DECL (flags);
-
-      /* since prec >= 1, we have m >= 24 here, which ensures n >= 9 below */
-      /* Compute n >= prec * log(2)/2 (see algorithms.tex). */
-      mpfr_set_exp_t (z, m, MPFR_RNDU);
-      mpfr_mul (z, z, l2, MPFR_RNDU);
-      mpfr_ceil (z, z);
-      MPFR_LOG_VAR (z);
-      MPFR_BLOCK (flags, n = mpfr_get_ui (z, MPFR_RNDN));
-      MPFR_ASSERTD (n >= 9 && ! MPFR_ERANGEFLAG (flags));
-      mpfr_const_euler_S2 (y, z); /* error <= 3 ulps */
-      exp_S = MPFR_EXP(y);
-      mpfr_log (z, z, MPFR_RNDD); /* error <= 1 ulp */
-      mpfr_sub (y, y, z, MPFR_RNDN); /* S'(n) - log(n) */
-      /* the error is less than 1/2 + 3*2^(exp_S-EXP(y)) + 2^(EXP(z)-EXP(y))
-         <= 1/2 + 2^(exp_S+2-EXP(y)) + 2^(EXP(z)-EXP(y))
-         <= 1/2 + 2^(1+MAX(exp_S+2,EXP(z))-EXP(y)) */
-      err = 1 + MAX(exp_S + 2, MPFR_EXP(z)) - MPFR_EXP(y);
-      err = (err >= -1) ? err + 1 : 0; /* error <= 2^err ulp(y) */
-      exp_S = MPFR_EXP(y);
-      mpfr_const_euler_R (z, n); /* err <= ulp(1/2) = 2^(-m) */
-      mpfr_sub (y, y, z, MPFR_RNDN);
-      /* err <= 1/2 ulp(y) + 2^(-m) + 2^(err + exp_S - EXP(y)) ulp(y).
-         Since the result is between 0.5 and 1, ulp(y) = 2^(-m).
-         So we get 3/2*ulp(y) + 2^(err + exp_S - EXP(y)) ulp(y).
-         3/2 + 2^e <= 2^(e+1) for e>=1, and <= 2^2 otherwise */
-      err = err + exp_S - MPFR_EXP(y);
-      err = (err >= 1) ? err + 1 : 2;
-      if (MPFR_LIKELY (MPFR_CAN_ROUND (y, m - err, prec, rnd)))
-        break;
-      MPFR_ZIV_NEXT (loop, m);
-      mpfr_set_prec (y, m);
-      mpfr_set_prec (z, m);
+      mpz_set_ui (s->P, N);
+      mpz_mul (s->P, s->P, s->P);
+      mpz_set_ui (s->Q, n1 + 1);
+      mpz_mul (s->Q, s->Q, s->Q);
+      mpz_set_ui (s->C, 1);
+      mpz_set_ui (s->D, n1 + 1);
+      mpz_set (s->T, s->P);
+      mpz_set (s->V, s->P);
     }
-  MPFR_ZIV_FREE (loop);
+  else
+    {
+      mpfr_const_euler_bs_t L, R;
+      mpz_t t, u, v;
+      unsigned long m = (n1 + n2) / 2;
 
-  inexact = mpfr_set (x, y, rnd);
+      mpfr_const_euler_bs_init (L);
+      mpfr_const_euler_bs_init (R);
+      mpfr_const_euler_bs_1 (L, n1, m, N, 1);
+      mpfr_const_euler_bs_1 (R, m, n2, N, 1);
 
-  mpfr_clear (y);
-  mpfr_clear (z);
-  mpfr_clear (l2);
+      mpz_init (t);
+      mpz_init (u);
+      mpz_init (v);
 
-  return inexact; /* always inexact */
+      if (cont)
+        mpz_mul (s->P, L->P, R->P);
+
+      mpz_mul (s->Q, L->Q, R->Q);
+      mpz_mul (s->D, L->D, R->D);
+
+      /* T = LP RT + RQ LT*/
+      mpz_mul (t, L->P, R->T);
+      mpz_mul (v, R->Q, L->T);
+      mpz_add (s->T, t, v);
+
+      /* C = LC RD + RC LD */
+      if (cont)
+        {
+          mpz_mul (s->C, L->C, R->D);
+          mpz_addmul (s->C, R->C, L->D);
+        }
+
+      /* V = RD (RQ LV + LC LP RT) + LD LP RV */
+      mpz_mul (u, L->P, R->V);
+      mpz_mul (u, u, L->D);
+      mpz_mul (v, R->Q, L->V);
+      mpz_addmul (v, t, L->C);
+      mpz_mul (v, v, R->D);
+      mpz_add (s->V, u, v);
+
+      mpfr_const_euler_bs_clear (L);
+      mpfr_const_euler_bs_clear (R);
+      mpz_clear (t);
+      mpz_clear (u);
+      mpz_clear (v);
+  }
 }
 
 static void
-mpfr_const_euler_S2_aux (mpz_t P, mpz_t Q, mpz_t T, mpfr_t n,
-                         unsigned long a, unsigned long b, int need_P)
+mpfr_const_euler_bs_2 (mpz_t P, mpz_t Q, mpz_t T,
+                       unsigned long n1, unsigned long n2, unsigned long N,
+                       int cont)
 {
-  if (a + 1 == b)
+  if (n2 - n1 == 1)
     {
-      int inex;
-      MPFR_BLOCK_DECL (flags);
-
-      MPFR_BLOCK (flags, MPFR_DBGRES (inex = mpfr_get_z (P, n, MPFR_RNDN)));
-      MPFR_ASSERTD (inex == 0 && ! MPFR_ERANGEFLAG (flags));
-      if (a > 1)
-        mpz_mul_si (P, P, 1 - (long) a);
+      if (n1 == 0)
+        {
+          mpz_set_ui (P, 1);
+          mpz_set_ui (Q, 4 * N);
+        }
+      else
+        {
+          mpz_set_ui (P, 2 * n1 - 1);
+          mpz_pow_ui (P, P, 3);
+          mpz_set_ui (Q, 32 * n1);
+          mpz_mul_ui (Q, Q, N);
+          mpz_mul_ui (Q, Q, N);
+        }
       mpz_set (T, P);
-      mpz_set_ui (Q, a);
-      mpz_mul_ui (Q, Q, a);
     }
   else
     {
-      unsigned long c = (a + b) / 2;
       mpz_t P2, Q2, T2;
-      mpfr_const_euler_S2_aux (P, Q, T, n, a, c, 1);
+      unsigned long m = (n1 + n2) / 2;
+
       mpz_init (P2);
       mpz_init (Q2);
       mpz_init (T2);
-      mpfr_const_euler_S2_aux (P2, Q2, T2, n, c, b, 1);
+      mpfr_const_euler_bs_2 (P, Q, T, n1, m, N, 1);
+      mpfr_const_euler_bs_2 (P2, Q2, T2, m, n2, N, 1);
       mpz_mul (T, T, Q2);
       mpz_mul (T2, T2, P);
       mpz_add (T, T, T2);
-      if (need_P)
+      if (cont)
         mpz_mul (P, P, P2);
       mpz_mul (Q, Q, Q2);
       mpz_clear (P2);
       mpz_clear (Q2);
       mpz_clear (T2);
-      /* divide by 2 if possible */
-      {
-        unsigned long v2;
-        v2 = mpz_scan1 (P, 0);
-        c = mpz_scan1 (Q, 0);
-        if (c < v2)
-          v2 = c;
-        c = mpz_scan1 (T, 0);
-        if (c < v2)
-          v2 = c;
-        if (v2)
-          {
-            mpz_tdiv_q_2exp (P, P, v2);
-            mpz_tdiv_q_2exp (Q, Q, v2);
-            mpz_tdiv_q_2exp (T, T, v2);
-          }
-      }
     }
 }
 
-/* computes S(n) = sum(n^k*(-1)^(k-1)/k!/k, k=1..ceil(4.319136566 * n))
-   using binary splitting.
-   We have S(n) = sum(f(k), k=1..N) with N=ceil(4.319136566 * n)
-   and f(k) = n^k*(-1)*(k-1)/k!/k,
-   thus f(k)/f(k-1) = -n*(k-1)/k^2
-*/
-static void
-mpfr_const_euler_S2 (mpfr_t x, mpfr_t n)
-{
-  mpz_t P, Q, T;
-  unsigned long N;
-  mpfr_t NN;
-  MPFR_BLOCK_DECL (flags);
-
-  mpfr_init2 (NN, 64);
-  mpfr_set_str_binary (NN, /* a+2 = a*log(a), rounded to +infinity */
-    "100.01010001101100101110111100011011001011011111001111001111");
-  mpfr_mul (NN, NN, n, MPFR_RNDU);
-  MPFR_LOG_VAR (NN);
-  /* N = ceil(a.n) upper bound */
-  MPFR_BLOCK (flags, N = mpfr_get_ui (NN, MPFR_RNDU));
-  MPFR_ASSERTD (! MPFR_ERANGEFLAG (flags));
-  mpz_init (P);
-  mpz_init (Q);
-  mpz_init (T);
-  mpfr_const_euler_S2_aux (P, Q, T, n, 1, N + 1, 0);
-  mpfr_set_z (x, T, MPFR_RNDN);
-  mpfr_div_z (x, x, Q, MPFR_RNDN);
-  mpz_clear (P);
-  mpz_clear (Q);
-  mpz_clear (T);
-  mpfr_clear (NN);
-}
-
-/* computes R(n) = exp(-n)/n * sum(k!/(-n)^k, k=0..n-2)
-   with error at most 4*ulp(x). Assumes n>=2.
-   Since x <= exp(-n)/n <= 1/8, then 4*ulp(x) <= ulp(1).
-*/
-static void
-mpfr_const_euler_R (mpfr_t x, unsigned long n)
+int
+mpfr_const_euler_internal (mpfr_t x, mpfr_rnd_t rnd)
 {
-  unsigned long k, m;
-  mpz_t a, s;
+  mpfr_const_euler_bs_t sum;
+  mpz_t t, u, v;
+  unsigned long n, N;
+  mpfr_prec_t prec, wp, magn;
   mpfr_t y;
+  int inexact;
+  MPFR_ZIV_DECL (loop);
 
-  MPFR_ASSERTN (n >= 2); /* ensures sum(k!/(-n)^k, k=0..n-2) >= 2/3 */
-
-  /* as we multiply the sum by exp(-n), we need only PREC(x) - n/LOG2 bits */
-  m = MPFR_PREC(x) - (unsigned long) ((double) n / LOG2);
+  prec = mpfr_get_prec (x);
+  wp = prec + 24;
 
-  mpz_init_set_ui (a, 1);
-  mpz_mul_2exp (a, a, m);
-  mpz_init_set (s, a);
+  mpfr_init2 (y, wp);
+  mpfr_const_euler_bs_init (sum);
+  mpz_init (t);
+  mpz_init (u);
+  mpz_init (v);
 
-  for (k = 1; k <= n; k++)
+  MPFR_ZIV_INIT (loop, wp);
+  for (;;)
     {
-      mpz_mul_ui (a, a, k);
-      mpz_fdiv_q_ui (a, a, n);
-      /* the error e(k) on a is e(k) <= 1 + k/n*e(k-1) with e(0)=0,
-         i.e. e(k) <= k */
-      if (k % 2)
-        mpz_sub (s, s, a);
-      else
-        mpz_add (s, s, a);
+      /* The approximation error is bounded by 24 exp(-8n) when
+         n > 1, which is smaller than 2^-wp if
+         n > (wp + log_2(24)) * (log(2)/8).
+         Note log2(24) < 5 and log(2)/8 < 866434 / 10000000. */
+      mpz_set_ui (t, wp + 5);
+      mpz_mul_ui (t, t, 866434);
+      mpz_cdiv_q_ui (t, t, 10000000);
+      n = mpz_get_ui (t);
+
+      /* It is sufficient to take N >= alpha*n + 1
+         where alpha = 3/LambertW(3/e) = 4.970625759544... */
+      mpz_set_ui (t, n);
+      mpz_mul_ui (t, t, 4970626);
+      mpz_cdiv_q_ui (t, t, 1000000);
+      mpz_add_ui (t, t, 1);
+      N = mpz_get_ui (t);
+
+      /* V / ((T + Q) * D) = S / I
+         where S = sum_{k=0}^{N-1} H_k n^(2k) / (k!)^2,
+               I = sum_{k=0}^{N-1} n^(2k) / (k!)^2 */
+      mpfr_const_euler_bs_1 (sum, 0, N, n, 0);
+      mpz_add (sum->T, sum->T, sum->Q);
+      mpz_mul (t, sum->T, sum->D);
+      mpz_mul_2exp (u, sum->V, wp);
+      mpz_tdiv_q (v, u, t);
+      /* v * 2^-wp = S/I with error < 1 */
+
+      /* C / (D * V) = U where
+         U = (1/(4n)) sum_{k=0}^{2n-1} [(2k)!]^3 / ((k!)^4 8^(2k) (2n)^(2k)) */
+      mpfr_const_euler_bs_2 (sum->C, sum->D, sum->V, 0, 2*n, n, 0);
+      mpz_mul (t, sum->Q, sum->Q);
+      mpz_mul (t, t, sum->V);
+      mpz_mul (u, sum->T, sum->T);
+      mpz_mul (u, u, sum->D);
+      mpz_mul_2exp (t, t, wp);
+      mpz_div (t, t, u);
+      /* t * 2^-wp = U/I^2 with error < 1 */
+
+      /* gamma = S/I - U/I^2 - log(n) with error at most 2^-wp */
+      mpz_sub (v, v, t);
+      /* v * 2^-wp now equals gamma + log(n) with error at most 3*2^-wp */
+
+      /* log(n) < 2^ceil(log2(n)) */
+      magn = MPFR_INT_CEIL_LOG2(n);
+      mpfr_set_prec (y, wp + magn);
+      mpfr_set_ui (y, n, MPFR_RNDZ); /* exact */
+      mpfr_log (y, y, MPFR_RNDZ); /* error < 2^-wp */
+
+      mpfr_mul_2exp (y, y, wp, MPFR_RNDZ);
+      mpfr_z_sub (y, v, y, MPFR_RNDZ);
+      mpfr_div_2exp (y, y, wp, MPFR_RNDZ);
+      /* rounding error from the last subtraction < 2^-wp */
+      /* so y = gamma with error < 5*2^-wp */
+
+      if (MPFR_LIKELY (MPFR_CAN_ROUND (y, wp - 3, prec, rnd)))
+        break;
+
+      MPFR_ZIV_NEXT (loop, wp);
     }
-  /* the error on s is at most 1+2+...+n = n*(n+1)/2 */
-  mpz_fdiv_q_ui (s, s, n); /* err <= 1 + (n+1)/2 */
-  MPFR_ASSERTN (MPFR_PREC(x) >= mpz_sizeinbase(s, 2));
-  mpfr_set_z (x, s, MPFR_RNDD); /* exact */
-  mpfr_div_2ui (x, x, m, MPFR_RNDD);
-  /* now x = 1/n * sum(k!/(-n)^k, k=0..n-2) <= 1/n */
-  /* err(x) <= (n+1)/2^m <= (n+1)*exp(n)/2^PREC(x) */
-
-  mpfr_init2 (y, m);
-  mpfr_set_si (y, -(long)n, MPFR_RNDD); /* assumed exact */
-  mpfr_exp (y, y, MPFR_RNDD); /* err <= ulp(y) <= exp(-n)*2^(1-m) */
-  mpfr_mul (x, x, y, MPFR_RNDD);
-  /* err <= ulp(x) + (n + 1 + 2/n) / 2^prec(x)
-     <= ulp(x) + (n + 1 + 2/n) ulp(x)/x since x*2^(-prec(x)) < ulp(x)
-     <= ulp(x) + (n + 1 + 2/n) 3/(2n) ulp(x) since x >= 2/3*n for n >= 2
-     <= 4 * ulp(x) for n >= 2 */
+  
+  MPFR_ZIV_FREE (loop);
+  inexact = mpfr_set (x, y, rnd);
+
   mpfr_clear (y);
+  mpz_clear (t);
+  mpz_clear (u);
+  mpz_clear (v);
+  mpfr_const_euler_bs_clear (sum);
 
-  mpz_clear (a);
-  mpz_clear (s);
+  return inexact; /* always inexact */
 }
+