From 85737c3a04b0177289a86354ca147945fb95e83d Mon Sep 17 00:00:00 2001
From: zimmerma
Date: Mon, 16 Jan 2017 16:49:01 +0000
Subject: [src/div.c] new variant of mpfr_div2_approx()

git-svn-id: svn://scm.gforge.inria.fr/svn/mpfr/trunk@11203 280ebfd0-de03-0410-8827-d642c229c3f4
---
 src/div.c | 143 +++++++++++++++++++-------------------------------------------
 1 file changed, 43 insertions(+), 100 deletions(-)

diff --git a/src/div.c b/src/div.c
index 2fcb419bf..7cafef917 100644
--- a/src/div.c
+++ b/src/div.c
@@ -37,61 +37,22 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
 
 #include "invert_limb.h"
 
-#if 1
-/* Given u = u1*B+u0 < d = d1*B+d0 with d normalized (high bit of d1 set),
-   put in v = v1*B+v0 an approximation of floor(u*B^2/d), with:
-   B = 2^GMP_NUMB_BITS and v <= floor(u*B^2/d) <= v + 16.
-   Note: this function requires __gmpfr_invert_limb (from invert_limb.h)
-   which is only provided so far for 64-bit limb. */
-static void
-mpfr_div2_approx (mpfr_limb_ptr v1, mpfr_limb_ptr v0,
-                  mp_limb_t u1, mp_limb_t u0,
-                  mp_limb_t d1, mp_limb_t d0)
-{
-  mp_limb_t x, y, dummy, z2, z1, z0;
-
-  if (MPFR_UNLIKELY(d1 + 1 == MPFR_LIMB_ZERO))
-    x = 0;
-  else
-    __gmpfr_invert_limb (x, d1 + 1); /* B + x = floor((B^2-1)/(d1+1)) */
-  umul_ppmm (y, dummy, u1, x);
-  y += u1;
-  /* now y = floor(B*u1/d1) with y < B*u1/d1, thus even when
-     u1=d1, y < B */
-  umul_ppmm (dummy, z0, y, d0);
-  umul_ppmm (z2, z1, y, d1);
-  z1 += dummy;
-  z2 += (z1 < dummy);
-  z1 += (z0 != 0);
-  z2 += (z1 == 0 && z0 != 0);
-  /* now z = z2*B+z1 = ceil(y*d/B), and should cancel with u */
-  sub_ddmmss (z2, z1, u1, u0, z2, z1);
-  *v1 = y;
-  /* y*B + (B+x)*(z2*B+z1)/B */
-  umul_ppmm (*v0, dummy, x, z1);
-  /* add z1 */
-  *v0 += z1;
-  *v1 += (*v0 < z1);
-  /* add (B+x)*z2 */
-  while (z2--)
-    {
-      *v0 += x;
-      *v1 += 1 + (*v0 < x);
-    }
-}
-#else
-/* the following is some experimental code, not fully proven correct,
-   and which does not seem to be faster, unless we find a way to speed up
-   the while loop (if 'while' is replaced by 'if' it becomes faster than
-   the above code, but is then wrong) */
+/* Given u = u1*B+u0 < v = v1*B+v0 with v normalized (high bit of v1 set),
+   put in q = Q1*B+Q0 an approximation of floor(u*B^2/v), with:
+   B = 2^GMP_NUMB_BITS and q <= floor(u*B^2/v) <= q + 21.
+   Note: this function requires __gmpfr_invert_limb_approx (from invert_limb.h)
+   which is only provided so far for 64-bit limb.
+   Note: __gmpfr_invert_limb_approx can be replaced by __gmpfr_invert_limb,
+   in that case the bound 21 reduces to 16. */
 static void
 mpfr_div2_approx (mpfr_limb_ptr Q1, mpfr_limb_ptr Q0,
                   mp_limb_t u1, mp_limb_t u0,
                   mp_limb_t v1, mp_limb_t v0)
 {
-  mp_limb_t inv, q1, q0, r2, r1, r0, cy;
+  mp_limb_t inv, q1, q0, r1, r0, cy, xx, yy;
 
-  /* first compute an approximation of q1 */
+  /* first compute an approximation of q1, using a lower approximation of
+     B^2/(v1+1) - B */
   if (MPFR_UNLIKELY(v1 == MPFR_LIMB_MAX))
     inv = MPFR_LIMB_ZERO;
   else
@@ -99,63 +60,45 @@ mpfr_div2_approx (mpfr_limb_ptr Q1, mpfr_limb_ptr Q0,
   /* now inv <= B^2/(v1+1) - B */
   umul_ppmm (q1, q0, u1, inv);
   q1 += u1;
+  /* now q1 <= u1*B/(v1+1) < (u1*B+u0)*B/(v1*B+v0) */
 
-  /* we have q1 <= floor(u1*2^GMP_NUMB_BITS/v1) <= q1 + 2 */
+  /* compute q1*(v1*B+v0) into r1:r0:yy and subtract from u1:u0:0 */
+  umul_ppmm (r1, r0, q1, v1);
+  umul_ppmm (xx, yy, q1, v0);
 
-  umul_ppmm (r2, r1, q1, v1);
-  umul_ppmm (cy, r0, q1, v0);
+  ADD_LIMB (r0, xx, cy);
   r1 += cy;
-  r2 += (r1 < cy);
-
-  /* Now r2:r1:r0 = q1 * v1:v0.  While q1*v1 <= u1:0, it might be that
-     r2:r1 > u1:u0.  First subtract r2:r1:r0 from u1:u0:0, with result
-     in r2:r1:r0. */
-  r0 = -r0;
-  r1 = u0 - r1 - (r0 != 0);
-  r2 = u1 - r2 - (r1 > u0 || (r1 == u0 && r0 != 0));
-
-  /* u1:u0:0 - q1 * v1:v0 >= u1*B^2 - q1*(v1*B+B-1) >= -q1*B > -B^2
-     thus r2 can be 0 or -1 here. */
-  if (r2 & MPFR_LIMB_HIGHBIT) /* correction performed at most two times */
-    {
-      ADD_LIMB(r0, v0, cy); /* r0 += v0; cy = r0 < v0 */
-      ADD_LIMB(r1, v1, cy);
-      r2 += cy;
-      q1 --;
-      if (r2 & MPFR_LIMB_HIGHBIT)
-        {
-          ADD_LIMB(r0, v0, cy); /* r0 += v0; cy = r0 < v0 */
-          ADD_LIMB(r1, v1, cy);
-          r2 += cy;
-          q1 --;
-        }
-    }
-  else
+
+  /* we ignore yy below, but first increment r0, to ensure we get a lower
+     approximation of the remainder */
+  r0 += (yy != 0);
+  r0 = u0 - r0;
+  r1 = u1 - r1 - (r0 > u0);
+
+  /* r1:r0 should be nonnegative */
+  MPFR_ASSERTD((r1 & MPFR_LIMB_HIGHBIT) == 0);
+
+  /* the second quotient limb is approximated by (r1*B^2+r0*B) / v1,
+     and since (B+inv)/B approximates B/v1, this is in turn approximated
+     by (r1*B+r0)*(B+inv)/B = r1*B+r1*inv+r0+(r0*inv/B) */
+
+  q0 = r0;
+  q1 += r1;
+  /* add floor(r0*inv/B) to q0 */
+  umul_ppmm (xx, yy, r0, inv);
+  ADD_LIMB (q0, xx, cy);
+  q1 += cy;
+  while (r1) /* the number of loops is at most 4 */
     {
-      /* experimentally, the number of loops is <= 5 */
-      while (r2 || r1 > v1 || (r1 == v1 && r0 >= v0))
-        {
-          r2 -= (r1 < v1 || (r1 == v1 && r0 < v0));
-          r1 -= v1 + (r0 < v0);
-          r0 -= v0;
-          q1 ++;
-        }
+      /* add B+inv */
+      q0 += inv;
+      q1 += (q0 < inv);
+      r1 --;
    }
 
-  /* now u1:u0:0 = q1 * d1:d0 + r1:r0, with 0 <= r1:r0 < d1:d0 */
-  MPFR_ASSERTD(r2 == MPFR_LIMB_ZERO);
-  MPFR_ASSERTD(r1 < v1 || (r1 == v1 && r0 < v0));
-
-  /* the second quotient limb is approximated by r1*B / d1,
-     which is itself approximated by r1+(r1*inv/B) */
-  umul_ppmm (q0, r0, r1, inv);
-
   *Q1 = q1;
-  *Q0 = r1 + q0;
-
-  /* experimentally: q1:q0 <= floor(B^2*u/v) <= q1:q0 + 5 */
+  *Q0 = q0;
 }
-#endif
 
 #endif /* GMP_NUMB_BITS == 64 */
 
@@ -329,8 +272,8 @@ mpfr_div_2 (mpfr_ptr q, mpfr_srcptr u, mpfr_srcptr v, mpfr_rnd_t rnd_mode)
 #if GMP_NUMB_BITS == 64
   mpfr_div2_approx (&q1, &q0, r3, r2, v1, v0);
   /* we know q1*B+q0 is smaller or equal to the exact quotient, with
-     difference at most 16 */
-  if (MPFR_LIKELY(((q0 + 16) & (mask >> 1)) > 16))
+     difference at most 21 */
+  if (MPFR_LIKELY(((q0 + 21) & (mask >> 1)) > 21))
     sb = 1; /* result is not exact when we can round with an approximation */
   else
     {
--
cgit v1.2.1
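
As a rough illustration of the reciprocal step behind the first quotient limb in mpfr_div2_approx (q1 = u1 + floor(u1*inv/B), where inv is a lower approximation of B^2/(v1+1) - B), here is a minimal standalone sketch. It is not MPFR code: it assumes a 64-bit limb and a compiler with unsigned __int128 (GCC/Clang), and it computes the reciprocal with an exact 128-bit division where the patch uses __gmpfr_invert_limb_approx, so it only demonstrates the "never exceeds the true quotient" invariant, not the 21-unit bound stated in the patch. The helper names (mulhi, maxgap) are made up for the example.

/* Standalone sketch (not MPFR code): with B = 2^64 and v1 normalized
   (high bit set), let inv = floor((B^2-1)/(v1+1)) - B.  Then
   q1 = u1 + floor(u1*inv/B) is a lower approximation of floor(u1*B/v1),
   which is the first step of mpfr_div2_approx.  Assumes GCC/Clang on a
   64-bit target (unsigned __int128); the exact 128/64-bit divisions stand
   in for __gmpfr_invert_limb / __gmpfr_invert_limb_approx. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef unsigned __int128 u128;

static uint64_t mulhi (uint64_t a, uint64_t b)
{
  return (uint64_t) (((u128) a * b) >> 64);   /* high limb of a*b */
}

int main (void)
{
  uint64_t maxgap = 0;

  for (int i = 0; i < 1000000; i++)
    {
      uint64_t v1, u1, inv, q1, exact;

      /* random normalized divisor limb (high bit set) and u1 < v1 */
      v1 = ((uint64_t) rand () << 33) ^ ((uint64_t) rand () << 11) ^ rand ();
      v1 |= (uint64_t) 1 << 63;
      u1 = (((uint64_t) rand () << 33) ^ ((uint64_t) rand () << 11) ^ rand ()) % v1;

      /* inv = floor((B^2-1)/(v1+1)) - B, with the special case v1+1 = B */
      inv = (v1 == UINT64_MAX) ? 0
          : (uint64_t) (~(u128) 0 / (v1 + 1) - ((u128) 1 << 64));

      /* first-limb approximation, as in mpfr_div2_approx */
      q1 = u1 + mulhi (u1, inv);

      /* exact first quotient limb floor(u1*B/v1); fits in 64 bits since u1 < v1 */
      exact = (uint64_t) (((u128) u1 << 64) / v1);

      assert (q1 <= exact);           /* always a lower approximation */
      if (exact - q1 > maxgap)
        maxgap = exact - q1;
    }
  printf ("largest observed defect of q1: %llu\n", (unsigned long long) maxgap);
  return 0;
}

The assert mirrors the invariant the patch relies on (its comments state that q1 <= u1*B/(v1+1) and that q1*B+q0 never exceeds the exact quotient), which is why the later steps only ever add corrections to the approximation.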