5 files changed, 123 insertions, 28 deletions
diff --git a/configure.ac b/configure.ac
index b6b0500a7..662361849 100644
--- a/configure.ac
+++ b/configure.ac
@@ -501,7 +501,8 @@ dnl WANT_GMP_INTERNALS is defined. Only the GMP public API should be used
 dnl by default, in particular by binary distributions. Moreover the check
 dnl below may yield an incorrect result as libtool isn't used in configure
 dnl (see above).
-AC_CHECK_FUNCS([__gmpn_rootrem])
+dnl Same for __gmpn_sbpi1_divappr_q.
+AC_CHECK_FUNCS([__gmpn_rootrem __gmpn_sbpi1_divappr_q])
 
 dnl End of tests which need to link with GMP.
 fi
diff --git a/src/mparam_h.in b/src/mparam_h.in
index 1ba88f42a..371ba7f9d 100644
--- a/src/mparam_h.in
+++ b/src/mparam_h.in
@@ -77,7 +77,6 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
 #define MPFR_TUNE_CASE "src/hppa/mparam.h"
 #include "hppa/mparam.h"
 
-/* __mips64? __mips? */
 #else
 #define MPFR_TUNE_CASE "default"
 #endif
diff --git a/src/mpfr-gmp.h b/src/mpfr-gmp.h
index d916d6bed..e6c9a0614 100644
--- a/src/mpfr-gmp.h
+++ b/src/mpfr-gmp.h
@@ -314,6 +314,88 @@ __MPFR_DECLSPEC void mpfr_tmp_free _MPFR_PROTO ((struct tmp_marker *));
 #define TMP_MARK(m) (tmp_marker = 0)
 #define TMP_FREE(m) mpfr_tmp_free (tmp_marker)
 
+/* invert_limb macro, copied from GMP 5.0.2, file gmp-impl.h.
+   It returns invxl = floor((B^2-1)/xl)-B, where B=2^BITS_PER_LIMB,
+   assuming the most significant bit of xl is set. */
+#undef invert_limb
+#define invert_limb(invxl,xl)                             \
+  do {                                                    \
+    mp_limb_t dummy;                                      \
+    MPFR_ASSERTD ((xl) != 0);                             \
+    udiv_qrnnd (invxl, dummy, ~(xl), ~(mp_limb_t)0, xl);  \
+  } while (0)
+
+/* invert_pi1 macro, adapted from GMP 5.0.2, file gmp-impl.h.
+   It returns dinv = floor((B^3-1)/(d1*B+d0))-B, where B=2^BITS_PER_LIMB,
+   assuming the most significant bit of d1 is set. */
+typedef struct {mp_limb_t inv32;} gmp_pi1_t;
+#undef invert_pi1
+#define invert_pi1(dinv, d1, d0)				\
+  do {								\
+    mp_limb_t v, p, t1, t0, mask;				\
+    invert_limb (v, d1);					\
+    p = d1 * v;							\
+    p += d0;							\
+    if (p < d0)							\
+      {								\
+	v--;							\
+	mask = -(p >= d1);					\
+	p -= d1;						\
+	v += mask;						\
+	p -= mask & d1;						\
+      }								\
+    umul_ppmm (t1, t0, d0, v);					\
+    p += t1;							\
+    if (p < t1)							\
+      {								\
+        v--;							\
+	if (MPFR_UNLIKELY (p >= d1))                            \
+	  {							\
+	    if (p > d1 || t0 >= d0)				\
+	      v--;						\
+	  }							\
+      }								\
+    (dinv).inv32 = v;                                           \
+  } while (0)
+
+/* udiv_qr_3by2 macro, adapted from GMP 5.0.2, file gmp-impl.h.
+   Compute quotient the quotient and remainder for n / d. Requires d
+   >= B^2 / 2 and n < d B. dinv is the inverse
+
+     floor ((B^3 - 1) / (d0 + d1 B)) - B.
+
+   NOTE: Output variables are updated multiple times. Only some inputs
+   and outputs may overlap.
+*/
+#undef udiv_qr_3by2
+#define udiv_qr_3by2(q, r1, r0, n2, n1, n0, d1, d0, dinv)		\
+  do {									\
+    mp_limb_t _q0, _t1, _t0, _mask;					\
+    umul_ppmm ((q), _q0, (n2), (dinv));					\
+    add_ssaaaa ((q), _q0, (q), _q0, (n2), (n1));			\
+									\
+    /* Compute the two most significant limbs of n - q'd */		\
+    (r1) = (n1) - (d1) * (q);						\
+    (r0) = (n0);							\
+    sub_ddmmss ((r1), (r0), (r1), (r0), (d1), (d0));			\
+    umul_ppmm (_t1, _t0, (d0), (q));					\
+    sub_ddmmss ((r1), (r0), (r1), (r0), _t1, _t0);			\
+    (q)++;								\
+									\
+    /* Conditionally adjust q and the remainders */			\
+    _mask = - (mp_limb_t) ((r1) >= _q0);				\
+    (q) += _mask;							\
+    add_ssaaaa ((r1), (r0), (r1), (r0), _mask & (d1), _mask & (d0));	\
+    if (MPFR_UNLIKELY ((r1) >= (d1)))					\
+      {									\
+	if ((r1) > (d1) || (r0) >= (d0))				\
+	  {								\
+	    (q)++;							\
+	    sub_ddmmss ((r1), (r0), (r1), (r0), (d1), (d0));		\
+	  }								\
+      }									\
+  } while (0)
+
 #if defined (__cplusplus)
 }
 #endif
diff --git a/src/mulders.c b/src/mulders.c
index fae93d9ce..5110f2674 100644
--- a/src/mulders.c
+++ b/src/mulders.c
@@ -189,6 +189,7 @@ static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
 #define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0]))
 #endif
 
+#if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q))
 /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
    with the most significant limb of the quotient as return value (0 or 1).
    Assumes the most significant bit of D is set. Clobbers N.
@@ -200,8 +201,6 @@ mpfr_divhigh_n_basecase (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
 {
   mp_limb_t qh, d1, d0, dinv, q2, q1, q0;
   gmp_pi1_t dinv2;
-  mp_size_t n0 = n;
-  mp_ptr np0 = np;
 
   np += n;
 
@@ -214,7 +213,7 @@ mpfr_divhigh_n_basecase (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
 
   if (n == 1)
     {
-      dinv = __MPN(invert_limb) (d1);
+      invert_limb (dinv, d1);
       umul_ppmm (q1, q0, np[0], dinv);
       qp[0] = np[0] + q1;
       return qh;
@@ -227,30 +226,30 @@ mpfr_divhigh_n_basecase (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
   while (n > 1)
     {
       /* Invariant: it remains to reduce n limbs from N (in addition to the
-	 initial low n limbs).
-	 Since n >= 2 here, necessarily we had n >= 2 initially, which means
-	 that in addition to the limb np[n-1] to reduce, we have at least 2
-	 extra limbs, thus accessing np[n-3] is valid. */
+         initial low n limbs).
+         Since n >= 2 here, necessarily we had n >= 2 initially, which means
+         that in addition to the limb np[n-1] to reduce, we have at least 2
+         extra limbs, thus accessing np[n-3] is valid. */
 
       /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D,
-	 the largest possible partial quotient is B-1 */
+         the largest possible partial quotient is B-1 */
       if (MPFR_UNLIKELY(np[n - 1] == d1 && np[n - 2] == d0))
-	q2 = ~ (mp_limb_t) 0;
+        q2 = ~ (mp_limb_t) 0;
       else
-	udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
-		      d1, d0, dinv2.inv32);
+        udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
+                      d1, d0, dinv2.inv32);
       /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
-	 we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
-	 thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
-	 and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
-	 thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
-	 which proves that at most one correction is needed */
+         we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
+         thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
+         and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
+         thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
+         which proves that at most one correction is needed */
       q0 = mpn_submul_1 (np - 1, dp, n, q2);
       if (MPFR_UNLIKELY(q0 > np[n - 1]))
-	{
-	  mpn_add_n (np - 1, np - 1, dp, n);
+        {
+          mpn_add_n (np - 1, np - 1, dp, n);
           q2 --;
-	}
+        }
       qp[--n] = q2;
       dp ++;
     }
@@ -277,6 +276,7 @@ mpfr_divhigh_n_basecase (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
 
   return qh;
 }
+#endif
 
 /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
    with the most significant limb of the quotient as return value (0 or 1).
@@ -297,11 +297,19 @@ mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
   MPFR_ASSERTD (MPFR_MULHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
   k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);
 
-  /* for k=n, we use a full division (mpn_divrem) */
-
   if (k == 0)
+#if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
+  {
+    gmp_pi1_t dinv2;
+    invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
+    return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
+  }
+#else /* use our own code for base-case short division */
     return mpfr_divhigh_n_basecase (qp, np, dp, n);
+#endif
   else if (k == n)
+    /* for k=n, we use a division with remainder (mpn_divrem),
+     which computes the exact quotient */
     return mpn_divrem (qp, 0, np, 2 * n, dp, n);
 
   MPFR_ASSERTD ((n+4)/2 <= k && k < n); /* bounds from [1] */
diff --git a/tune/tuneup.c b/tune/tuneup.c
index 14810584c..1a0c4ab37 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -864,10 +864,15 @@ tune_div_mulders_upto (mp_size_t n)
   tbest = mpfr_speed_measure (speed_mpfr_divhigh, &s, "mpfr_divhigh");
 
   /* Check k == 0, i.e., mpfr_divhigh_n_basecase */
-  divhigh_ktab[n] = 0;
-  t = mpfr_speed_measure (speed_mpfr_divhigh, &s, "mpfr_divhigh");
-  if (t * TOLERANCE < tbest)
-    kbest = 0, tbest = t;
+#if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
+  if (n > 2) /* mpn_sbpi1_divappr_q requires dn > 2 */
+#endif
+    {
+      divhigh_ktab[n] = 0;
+      t = mpfr_speed_measure (speed_mpfr_divhigh, &s, "mpfr_divhigh");
+      if (t * TOLERANCE < tbest)
+        kbest = 0, tbest = t;
+    }
 
   /* Check Mulders */
   step = 1 + n / (2 * MAX_STEPS);
@@ -948,7 +953,7 @@ tune_div_mulders (FILE *f)
       if (k != MPFR_DIVHIGH_TAB_SIZE - 1)
         fputc (',', f);
       if ((k+1) % 16 == 0)
-	fprintf (f, " /*%u-%u*/ \\\n ", k - 15, k);
+	fprintf (f, " /*%zu-%zu*/ \\\n ", k - 15, k);
       if (verbose)
         putchar ('.');
     }