3 files changed, 933 insertions, 0 deletions
diff --git a/mpn/generic/hgcd2_jacobi.c b/mpn/generic/hgcd2_jacobi.c
new file mode 100644
index 000000000..c888deb1d
--- /dev/null
+++ b/mpn/generic/hgcd2_jacobi.c
@@ -0,0 +1,356 @@
+/* hgcd2_jacobi.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2008, 2011 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#if GMP_NAIL_BITS > 0
+#error Nails not supported.
+#endif
+
+/* FIXME: Duplicated in hgcd2.c. Should move to gmp-impl.h, and
+   possibly be renamed. */
+static inline mp_limb_t
+div1 (mp_ptr rp,
+      mp_limb_t n0,
+      mp_limb_t d0)
+{
+  mp_limb_t q = 0;
+
+  if ((mp_limb_signed_t) n0 < 0)
+    {
+      int cnt;
+      for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++)
+	{
+	  d0 = d0 << 1;
+	}
+
+      q = 0;
+      while (cnt)
+	{
+	  q <<= 1;
+	  if (n0 >= d0)
+	    {
+	      n0 = n0 - d0;
+	      q |= 1;
+	    }
+	  d0 = d0 >> 1;
+	  cnt--;
+	}
+    }
+  else
+    {
+      int cnt;
+      for (cnt = 0; n0 >= d0; cnt++)
+	{
+	  d0 = d0 << 1;
+	}
+
+      q = 0;
+      while (cnt)
+	{
+	  d0 = d0 >> 1;
+	  q <<= 1;
+	  if (n0 >= d0)
+	    {
+	      n0 = n0 - d0;
+	      q |= 1;
+	    }
+	  cnt--;
+	}
+    }
+  *rp = n0;
+  return q;
+}
+
+/* Two-limb division optimized for small quotients.  */
+static inline mp_limb_t
+div2 (mp_ptr rp,
+      mp_limb_t nh, mp_limb_t nl,
+      mp_limb_t dh, mp_limb_t dl)
+{
+  mp_limb_t q = 0;
+
+  if ((mp_limb_signed_t) nh < 0)
+    {
+      int cnt;
+      for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
+	{
+	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
+	  dl = dl << 1;
+	}
+
+      while (cnt)
+	{
+	  q <<= 1;
+	  if (nh > dh || (nh == dh && nl >= dl))
+	    {
+	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
+	      q |= 1;
+	    }
+	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
+	  dh = dh >> 1;
+	  cnt--;
+	}
+    }
+  else
+    {
+      int cnt;
+      for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
+	{
+	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
+	  dl = dl << 1;
+	}
+
+      while (cnt)
+	{
+	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
+	  dh = dh >> 1;
+	  q <<= 1;
+	  if (nh > dh || (nh == dh && nl >= dl))
+	    {
+	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
+	      q |= 1;
+	    }
+	  cnt--;
+	}
+    }
+
+  rp[0] = nl;
+  rp[1] = nh;
+
+  return q;
+}
+
+int
+mpn_hgcd2_jacobi (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
+		  struct hgcd_matrix1 *M, unsigned *bitsp)
+{
+  mp_limb_t u00, u01, u10, u11;
+  unsigned bits = *bitsp;
+
+  if (ah < 2 || bh < 2)
+    return 0;
+
+  if (ah > bh || (ah == bh && al > bl))
+    {
+      sub_ddmmss (ah, al, ah, al, bh, bl);
+      if (ah < 2)
+	return 0;
+
+      u00 = u01 = u11 = 1;
+      u10 = 0;
+      bits = mpn_jacobi_update (bits, 1, 1);
+    }
+  else
+    {
+      sub_ddmmss (bh, bl, bh, bl, ah, al);
+      if (bh < 2)
+	return 0;
+
+      u00 = u10 = u11 = 1;
+      u01 = 0;
+      bits = mpn_jacobi_update (bits, 0, 1);
+    }
+
+  if (ah < bh)
+    goto subtract_a;
+
+  for (;;)
+    {
+      ASSERT (ah >= bh);
+      if (ah == bh)
+	goto done;
+
+      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
+	{
+	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
+	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
+
+	  break;
+	}
+
+      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
+	 1), affecting the second column of M. */
+      ASSERT (ah > bh);
+      sub_ddmmss (ah, al, ah, al, bh, bl);
+
+      if (ah < 2)
+	goto done;
+
+      if (ah <= bh)
+	{
+	  /* Use q = 1 */
+	  u01 += u00;
+	  u11 += u10;
+	  bits = mpn_jacobi_update (bits, 1, 1);
+	}
+      else
+	{
+	  mp_limb_t r[2];
+	  mp_limb_t q = div2 (r, ah, al, bh, bl);
+	  al = r[0]; ah = r[1];
+	  if (ah < 2)
+	    {
+	      /* A is too small, but q is correct. */
+	      u01 += q * u00;
+	      u11 += q * u10;
+	      bits = mpn_jacobi_update (bits, 1, q & 3);
+	      goto done;
+	    }
+	  q++;
+	  u01 += q * u00;
+	  u11 += q * u10;
+	  bits = mpn_jacobi_update (bits, 1, q & 3);
+	}
+    subtract_a:
+      ASSERT (bh >= ah);
+      if (ah == bh)
+	goto done;
+
+      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
+	{
+	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
+	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
+
+	  goto subtract_a1;
+	}
+
+      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
+	 1), affecting the first column of M. */
+      sub_ddmmss (bh, bl, bh, bl, ah, al);
+
+      if (bh < 2)
+	goto done;
+
+      if (bh <= ah)
+	{
+	  /* Use q = 1 */
+	  u00 += u01;
+	  u10 += u11;
+	  bits = mpn_jacobi_update (bits, 0, 1);
+	}
+      else
+	{
+	  mp_limb_t r[2];
+	  mp_limb_t q = div2 (r, bh, bl, ah, al);
+	  bl = r[0]; bh = r[1];
+	  if (bh < 2)
+	    {
+	      /* B is too small, but q is correct. */
+	      u00 += q * u01;
+	      u10 += q * u11;
+	      bits = mpn_jacobi_update (bits, 0, q & 3);
+	      goto done;
+	    }
+	  q++;
+	  u00 += q * u01;
+	  u10 += q * u11;
+	  bits = mpn_jacobi_update (bits, 0, q & 3);
+	}
+    }
+
+  /* NOTE: Since we discard the least significant half limb, we don't
+     get a truly maximal M (corresponding to |a - b| <
+     2^{GMP_LIMB_BITS +1}). */
+  /* Single precision loop */
+  for (;;)
+    {
+      ASSERT (ah >= bh);
+      if (ah == bh)
+	break;
+
+      ah -= bh;
+      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
+	break;
+
+      if (ah <= bh)
+	{
+	  /* Use q = 1 */
+	  u01 += u00;
+	  u11 += u10;
+	  bits = mpn_jacobi_update (bits, 1, 1);
+	}
+      else
+	{
+	  mp_limb_t r;
+	  mp_limb_t q = div1 (&r, ah, bh);
+	  ah = r;
+	  if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
+	    {
+	      /* A is too small, but q is correct. */
+	      u01 += q * u00;
+	      u11 += q * u10;
+	      bits = mpn_jacobi_update (bits, 1, q & 3);
+	      break;
+	    }
+	  q++;
+	  u01 += q * u00;
+	  u11 += q * u10;
+	  bits = mpn_jacobi_update (bits, 1, q & 3);
+	}
+    subtract_a1:
+      ASSERT (bh >= ah);
+      if (ah == bh)
+	break;
+
+      bh -= ah;
+      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
+	break;
+
+      if (bh <= ah)
+	{
+	  /* Use q = 1 */
+	  u00 += u01;
+	  u10 += u11;
+	  bits = mpn_jacobi_update (bits, 0, 1);
+	}
+      else
+	{
+	  mp_limb_t r;
+	  mp_limb_t q = div1 (&r, bh, ah);
+	  bh = r;
+	  if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
+	    {
+	      /* B is too small, but q is correct. */
+	      u00 += q * u01;
+	      u10 += q * u11;
+	      bits = mpn_jacobi_update (bits, 0, q & 3);
+	      break;
+	    }
+	  q++;
+	  u00 += q * u01;
+	  u10 += q * u11;
+	  bits = mpn_jacobi_update (bits, 0, q & 3);
+	}
+    }
+
+ done:
+  M->u[0][0] = u00; M->u[0][1] = u01;
+  M->u[1][0] = u10; M->u[1][1] = u11;
+  *bitsp = bits;
+
+  return 1;
+}
diff --git a/mpn/generic/hgcd_jacobi.c b/mpn/generic/hgcd_jacobi.c
new file mode 100644
index 000000000..2dce43b99
--- /dev/null
+++ b/mpn/generic/hgcd_jacobi.c
@@ -0,0 +1,235 @@
+/* hgcd_jacobi.c.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003, 2004, 2005, 2008, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* This file is almost a copy of hgcd.c, with some added calls to
+   mpn_jacobi_update */ 
+
+struct hgcd_jacobi_ctx
+{
+  struct hgcd_matrix *M;
+  unsigned *bitsp;
+};
+
+static void
+hgcd_jacobi_hook (void *p, mp_srcptr gp, mp_size_t gn,
+		  mp_srcptr qp, mp_size_t qn, int d)
+{
+  ASSERT (!gp);
+  ASSERT (d >= 0);
+
+  MPN_NORMALIZE (qp, qn);
+  if (qn > 0)
+    {
+      struct hgcd_jacobi_ctx *ctx = (struct hgcd_jacobi_ctx *) p;
+      /* NOTES: This is a bit ugly. A tp area is passed to
+	 gcd_subdiv_step, which stores q at the start of that area. We
+	 now use the rest. */
+      mp_ptr tp = (mp_ptr) qp + qn;
+
+      mpn_hgcd_matrix_update_q (ctx->M, qp, qn, d, tp);
+      *ctx->bitsp = mpn_jacobi_update (*ctx->bitsp, d, qp[0] & 3);
+    }
+}
+
+/* Perform a few steps, using some of mpn_hgcd2, subtraction and
+   division. Reduces the size by almost one limb or more, but never
+   below the given size s. Return new size for a and b, or 0 if no
+   more steps are possible.
+
+   If hgcd2 succeds, needs temporary space for hgcd_matrix_mul_1, M->n
+   limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
+   fails, needs space for the quotient, qn <= n - s + 1 limbs, for and
+   hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
+   resulting size of M.
+
+   If N is the input size to the calling hgcd, then s = floor(N/2) +
+   1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
+   < N, so N is sufficient.
+*/
+
+static mp_size_t
+hgcd_jacobi_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
+		  struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp)
+{
+  struct hgcd_matrix1 M1;
+  mp_limb_t mask;
+  mp_limb_t ah, al, bh, bl;
+  mp_size_t an, bn, qn;
+  int col;
+
+  ASSERT (n > s);
+
+  mask = ap[n-1] | bp[n-1];
+  ASSERT (mask > 0);
+
+  if (n == s + 1)
+    {
+      if (mask < 4)
+	goto subtract;
+
+      ah = ap[n-1]; al = ap[n-2];
+      bh = bp[n-1]; bl = bp[n-2];
+    }
+  else if (mask & GMP_NUMB_HIGHBIT)
+    {
+      ah = ap[n-1]; al = ap[n-2];
+      bh = bp[n-1]; bl = bp[n-2];
+    }
+  else
+    {
+      int shift;
+
+      count_leading_zeros (shift, mask);
+      ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+      al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+      bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+      bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+    }
+
+  /* Try an mpn_hgcd2 step */
+  if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M1, bitsp))
+    {
+      /* Multiply M <- M * M1 */
+      mpn_hgcd_matrix_mul_1 (M, &M1, tp);
+
+      /* Can't swap inputs, so we need to copy. */
+      MPN_COPY (tp, ap, n);
+      /* Multiply M1^{-1} (a;b) */
+      return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n);
+    }
+
+ subtract:
+  {
+    struct hgcd_jacobi_ctx ctx;
+    ctx.M = M;
+    ctx.bitsp = bitsp;
+    
+    return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_jacobi_hook, &ctx, tp);
+  }
+}
+
+/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
+   with elements of size at most (n+1)/2 - 1. Returns new size of a,
+   b, or zero if no reduction is possible. */
+
+/* Same scratch requirements as for mpn_hgcd. */
+mp_size_t
+mpn_hgcd_jacobi (mp_ptr ap, mp_ptr bp, mp_size_t n,
+		 struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp)
+{
+  mp_size_t s = n/2 + 1;
+
+  mp_size_t nn;
+  int success = 0;
+
+  if (n <= s)
+    /* Happens when n <= 2, a fairly uninteresting case but exercised
+       by the random inputs of the testsuite. */
+    return 0;
+
+  ASSERT ((ap[n-1] | bp[n-1]) > 0);
+
+  ASSERT ((n+1)/2 - 1 < M->alloc);
+
+  if (ABOVE_THRESHOLD (n, HGCD_THRESHOLD))
+    {
+      mp_size_t n2 = (3*n)/4 + 1;
+      mp_size_t p = n/2;
+
+      nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, M, bitsp, tp);
+      if (nn > 0)
+	{
+	  /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
+	     = 2 (n - 1) */
+	  n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
+	  success = 1;
+	}
+      while (n > n2)
+	{
+	  /* Needs n + 1 storage */
+	  nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp);
+	  if (!nn)
+	    return success ? n : 0;
+	  n = nn;
+	  success = 1;
+	}
+
+      if (n > s + 2)
+	{
+	  struct hgcd_matrix M1;
+	  mp_size_t scratch;
+
+	  p = 2*s - n + 1;
+	  scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
+
+	  mpn_hgcd_matrix_init(&M1, n - p, tp);
+	  nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, &M1, bitsp, tp + scratch);
+	  if (nn > 0)
+	    {
+	      /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
+	      ASSERT (M->n + 2 >= M1.n);
+
+	      /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
+		 then either q or q + 1 is a correct quotient, and M1 will
+		 start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
+		 rules out the case that the size of M * M1 is much
+		 smaller than the expected M->n + M1->n. */
+
+	      ASSERT (M->n + M1.n < M->alloc);
+
+	      /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
+		 = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
+	      n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
+
+	      /* We need a bound for of M->n + M1.n. Let n be the original
+		 input size. Then
+
+		 ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2
+
+		 and it follows that
+
+		 M.n + M1.n <= ceil(n/2) + 1
+
+		 Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
+		 amount of needed scratch space. */
+	      mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
+	      success = 1;
+	    }
+	}
+    }
+
+  for (;;)
+    {
+      /* Needs s+3 < n */
+      nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp);
+      if (!nn)
+	return success ? n : 0;
+
+      n = nn;
+      success = 1;
+    }
+}
diff --git a/mpn/generic/jacobi_2.c b/mpn/generic/jacobi_2.c
new file mode 100644
index 000000000..9a8a2b551
--- /dev/null
+++ b/mpn/generic/jacobi_2.c
@@ -0,0 +1,342 @@
+/* jacobi_2.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2008, 2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef JACOBI_2_METHOD
+#define JACOBI_2_METHOD 2
+#endif
+
+/* Computes (a / b) where b is odd, and a and b are otherwise arbitrary
+   two-limb numbers. */
+#if JACOBI_2_METHOD == 1
+int
+mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit)
+{
+  mp_limb_t ah, al, bh, bl;
+  int c;
+
+  al = ap[0];
+  ah = ap[1];
+  bl = bp[0];
+  bh = bp[1];
+
+  ASSERT (bl & 1);
+
+  bl = ((bh << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK) | (bl >> 1);
+  bh >>= 1;
+
+  if ( (bh | bl) == 0)
+    return 1 - 2*(bit & 1);
+
+  if ( (ah | al) == 0)
+    return 0;
+
+  if (al == 0)
+    {
+      al = ah;
+      ah = 0;
+      bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1));
+    }
+  count_trailing_zeros (c, al);
+  bit ^= c & (bl ^ (bl >> 1));
+
+  c++;
+  if (UNLIKELY (c == GMP_NUMB_BITS))
+    {
+      al = ah;
+      ah = 0;
+    }
+  else
+    {
+      al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+      ah >>= c;
+    }
+  while ( (ah | bh) > 0)
+    {
+      mp_limb_t th, tl;
+      mp_limb_t bgta;
+
+      sub_ddmmss (th, tl, ah, al, bh, bl);
+      if ( (tl | th) == 0)
+	return 0;
+
+      bgta = LIMB_HIGHBIT_TO_MASK (th);
+
+      /* If b > a, invoke reciprocity */
+      bit ^= (bgta & al & bl);
+
+      /* b <-- min (a, b) */
+      add_ssaaaa (bh, bl, bh, bl, th & bgta, tl & bgta);
+
+      if ( (bh | bl) == 0)
+	return 1 - 2*(bit & 1);
+
+      /* a <-- |a - b| */
+      al = (bgta ^ tl) - bgta;
+      ah = (bgta ^ th);
+
+      if (UNLIKELY (al == 0))
+	{
+	  /* If b > a, al == 0 implies that we have a carry to
+	     propagate. */
+	  al = ah - bgta;
+	  ah = 0;
+	  bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1));
+	}
+      count_trailing_zeros (c, al);
+      c++;
+      bit ^= c & (bl ^ (bl >> 1));
+
+      if (UNLIKELY (c == GMP_NUMB_BITS))
+	{
+	  al = ah;
+	  ah = 0;
+	}
+      else
+	{
+	  al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+	  ah >>= c;
+	}
+    }
+
+  ASSERT (bl > 0);
+
+  while ( (al | bl) & GMP_LIMB_HIGHBIT)
+    {
+      /* Need an extra comparison to get the mask. */
+      mp_limb_t t = al - bl;
+      mp_limb_t bgta = - (bl > al);
+
+      if (t == 0)
+	return 0;
+
+      /* If b > a, invoke reciprocity */
+      bit ^= (bgta & al & bl);
+
+      /* b <-- min (a, b) */
+      bl += (bgta & t);
+
+      /* a <-- |a - b| */
+      al = (t ^ bgta) - bgta;
+
+      /* Number of trailing zeros is the same no matter if we look at
+       * t or a, but using t gives more parallelism. */
+      count_trailing_zeros (c, t);
+      c ++;
+      /* (2/b) = -1 if b = 3 or 5 mod 8 */
+      bit ^= c & (bl ^ (bl >> 1));
+
+      if (UNLIKELY (c == GMP_NUMB_BITS))
+	return 1 - 2*(bit & 1);
+
+      al >>= c;
+    }
+
+  /* Here we have a little impedance mismatch. Better to inline it? */
+  return mpn_jacobi_base (2*al+1, 2*bl+1, bit << 1);
+}
+#elif JACOBI_2_METHOD == 2
+int
+mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit)
+{
+  mp_limb_t ah, al, bh, bl;
+  int c;
+
+  al = ap[0];
+  ah = ap[1];
+  bl = bp[0];
+  bh = bp[1];
+
+  ASSERT (bl & 1);
+
+  /* Use bit 1. */
+  bit <<= 1;
+
+  if (bh == 0 && bl == 1)
+    /* (a|1) = 1 */
+    return 1 - (bit & 2);
+
+  if (al == 0)
+    {
+      if (ah == 0)
+	/* (0|b) = 0, b > 1 */
+	return 0;
+
+      count_trailing_zeros (c, ah);
+      bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1));
+
+      al = bl;
+      bl = ah >> c;
+
+      if (bl == 1)
+	/* (1|b) = 1 */
+	return 1 - (bit & 2);
+
+      ah = bh;
+
+      bit ^= al & bl;
+
+      goto b_reduced;
+    }
+  if ( (al & 1) == 0)
+    {
+      count_trailing_zeros (c, al);
+
+      al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+      ah >>= c;
+      bit ^= (c << 1) & (bl ^ (bl >> 1));
+    }
+  if (ah == 0)
+    {
+      if (bh > 0)
+	{
+	  bit ^= al & bl;
+	  MP_LIMB_T_SWAP (al, bl);
+	  ah = bh;
+	  goto b_reduced;
+	}
+      goto ab_reduced;
+    }
+
+  while (bh > 0)
+    {
+      /* Compute (a|b) */
+      while (ah > bh)
+	{
+	  sub_ddmmss (ah, al, ah, al, bh, bl);
+	  if (al == 0)
+	    {
+	      count_trailing_zeros (c, ah);
+	      bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1));
+
+	      al = bl;
+	      bl = ah >> c;
+	      ah = bh;
+
+	      bit ^= al & bl;
+	      goto b_reduced;
+	    }
+	  count_trailing_zeros (c, al);
+	  bit ^= (c << 1) & (bl ^ (bl >> 1));
+	  al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+	  ah >>= c;
+	}
+      if (ah == bh)
+	goto cancel_hi;
+
+      if (ah == 0)
+	{
+	  bit ^= al & bl;
+	  MP_LIMB_T_SWAP (al, bl);
+	  ah = bh;
+	  break;
+	}
+
+      bit ^= al & bl;
+
+      /* Compute (b|a) */
+      while (bh > ah)
+	{
+	  sub_ddmmss (bh, bl, bh, bl, ah, al);
+	  if (bl == 0)
+	    {
+	      count_trailing_zeros (c, bh);
+	      bit ^= ((GMP_NUMB_BITS + c) << 1) & (al ^ (al >> 1));
+
+	      bl = bh >> c;
+	      bit ^= al & bl;
+	      goto b_reduced;
+	    }
+	  count_trailing_zeros (c, bl);
+	  bit ^= (c << 1) & (al ^ (al >> 1));
+	  bl = ((bh << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (bl >> c);
+	  bh >>= c;
+	}
+      bit ^= al & bl;
+
+      /* Compute (a|b) */
+      if (ah == bh)
+	{
+	cancel_hi:
+	  if (al < bl)
+	    {
+	      MP_LIMB_T_SWAP (al, bl);
+	      bit ^= al & bl;
+	    }
+	  al -= bl;
+	  if (al == 0)
+	    return 0;
+
+	  count_trailing_zeros (c, al);
+	  bit ^= (c << 1) & (bl ^ (bl >> 1));
+	  al >>= c;
+
+	  if (al == 1)
+	    return 1 - (bit & 2);
+
+	  MP_LIMB_T_SWAP (al, bl);
+	  bit ^= al & bl;
+	  break;
+	}
+    }
+
+ b_reduced:
+  /* Compute (a|b), with b a single limb. */
+  ASSERT (bl & 1);
+
+  if (bl == 1)
+    /* (a|1) = 1 */
+    return 1 - (bit & 2);
+
+  while (ah > 0)
+    {
+      ah -= (al < bl);
+      al -= bl;
+      if (al == 0)
+	{
+	  if (ah == 0)
+	    return 0;
+	  count_trailing_zeros (c, ah);
+	  bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1));
+	  al = ah >> c;
+	  goto ab_reduced;
+	}
+      count_trailing_zeros (c, al);
+
+      al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+      ah >>= c;
+      bit ^= (c << 1) & (bl ^ (bl >> 1));
+    }
+ ab_reduced:
+  ASSERT (bl & 1);
+  ASSERT (bl > 1);
+
+  return mpn_jacobi_base (al, bl, bit);
+}
+#else
+#error Unsupported value for JACOBI_2_METHOD
+#endif