diff options
63 files changed, 3274 insertions, 4852 deletions
@@ -37,9 +37,9 @@ Makefile ^doc/version\.texi # All source files in mpn/ are either generated, or links -^mpn/.*\.c -^mpn/.*\.asm -^ +^mpn/[^/]*\.c +^mpn/[^/]*\.asm + ^\.libs .*\.a @@ -29,6 +29,38 @@ (DIVEXACT_BY3_METHOD): Don't default to 0 if HAVE_NATIVE_mpn_divexact_by3c. +2008-09-18 Niels Möller <nisse@lysator.liu.se> + + * mpn/generic/gcd.c (main): Added code for tuning of CHOOSE_P. + + * mpn/generic/hgcd.c (mpn_hgcd_matrix_mul): Assert that inputs are + normalized. + +2008-09-17 Niels Möller <nisse@lysator.liu.se> <nisse@king.swox.se> + + * mpn/generic/gcdext.c (mpn_gcdext): p = n/5 caused a + slowdown for large inputs. As a compromise, use p = n/2 for the + first iteration, and p = n/3 for the rest. Handle the first + iteration specially, since the initial u0 and u1 are trivial. + + * mpn/x86_64/gmp-mparam.h (GCDEXT_DC_THRESHOLD): Reduced threshold + from 409 to 390. + + * mpn/generic/gcdext.c (CHOOSE_P): New macro. Use p = n/5. + (mpn_gcdext): Use CHOOSE_P, and generalized the calculation of + scratch space. + + * tune/tuneup.c (tune_hgcd): Use default step factor. + + * mpn/x86_64/gmp-mparam.h: (GCD_DC_THRESHOLD): Reduced from 493 to + 412. + + * mpn/generic/gcd.c (CHOOSE_P): New macro, to determine the + split when calling hgcd. Use p = 2n/3, as that seems better than + the more obvious split p = n/2. + (mpn_gcd): Use CHOOSE_P, and generalized the calculation of + scratch space. + 2008-09-16 Torbjorn Granlund <tege@swox.com> * mpn/generic/toom_interpolate_7pts.c: Use new mpn_divexact_byN @@ -55,14 +87,175 @@ Choose function depending on DIVEXACT_BY3_METHOD. * gmp-impl.h (DIVEXACT_BY3_METHOD): Provide default. +2008-09-16 Niels Möller <nisse@lysator.liu.se> + + * mpn/generic/hgcd.c (mpn_hgcd_addmul2_n): Moved function to + gcdext.c, where it is used. + * mpn/generic/gcdext.c (addmul2_n): Moved and renamed, was + mpn_hgcd_addmul2_n. Made static. Deleted input normalization. + Deleted rn argument. 
+ (mpn_gcdext): Updated calls to addmul2_n, and added assertions.
+
+ * gmp-impl.h (MPN_HGCD_MATRIX_INIT_ITCH): Increased storage by four limbs.
+ (MPN_HGCD_LEHMER_ITCH): Reduced storage by one limb.
+ (MPN_GCD_SUBDIV_STEP_ITCH): Likewise.
+ (MPN_GCD_LEHMER_N_ITCH): Likewise.
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_init): Use two extra limbs.
+ (hgcd_step): Use overlapping arguments to mpn_tdiv_qr.
+ (mpn_hgcd_matrix_mul): Deleted normalization code. Tighter bounds
+ for the element size of the product. Needs two extra limbs of
+ storage for the elements.
+ (mpn_hgcd_itch): Updated storage calculation.
+
+ * mpn/generic/gcd_subdiv_step.c (mpn_gcd_subdiv_step): Use
+ overlapping arguments to mpn_tdiv_qr. Use mpn_zero_p.
+
+ * mpn/generic/gcd.c (mpn_gcd): Use mpn_zero_p.
+
+2008-09-15 Niels Möller <nisse@lysator.liu.se>
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_init): Updated for deleted
+ tp pointer.
+ (hgcd_matrix_update_q): Likewise.
+ (mpn_hgcd_matrix_mul): Likewise.
+ (mpn_hgcd_itch): Updated calculation of scratch space.
+
+ * gmp-impl.h (struct hgcd_matrix): Deleted tp pointer.
+ (MPN_HGCD_MATRIX_INIT_ITCH): Reduced storage.
+ (mpn_hgcd_step, MPN_HGCD_STEP_ITCH): Deleted declarations.
+
+2008-09-15 Niels Möller <nisse@lysator.liu.se> <nisse@king.swox.se>
+
+ * mpn/x86_64/gmp-mparam.h (MATRIX22_STRASSEN_THRESHOLD): New
+ threshold.
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_mul): Use mpn_matrix22_mul.
+ (mpn_hgcd_itch): Updated calculation of scratch space. Use
+ count_leading_zeros to get the recursion depth.
+
+ * mpn/generic/gcd.c (mpn_gcd): Fixed calculation of scratch space,
+ and use mpn_hgcd_itch.
+
+2008-09-15 Niels Möller <nisse@lysator.liu.se>
+
+ * tune/tuneup.c (tune_matrix22_mul): New function.
+ (all): Use it.
+
+ * tune/common.c (speed_mpn_matrix22_mul): New function.
+
+ * tune/Makefile.am (TUNE_MPN_SRCS_BASIC): Added matrix22_mul.c.
+
+ * tests/mpn/t-matrix22.c: Use MATRIX22_STRASSEN_THRESHOLD to
+ select sizes for tests. 
+ + * gmp-impl.h (MATRIX22_STRASSEN_THRESHOLD): New threshold + + * configure.in (gmp_mpn_functions): Added matrix22_mul. + * gmp-impl.h: Added declarations for mpn_matrix22_mul and related + functions. + + * mpn/Makefile.am (nodist_EXTRA_libmpn_la_SOURCES): Added + matrix22_mul.c. + * tests/mpn/Makefile.am (check_PROGRAMS): Added t-matrix22. + + * tests/mpn/t-matrix22.c: New file. + * mpn/generic/matrix22_mul.c: New file. + +2008-09-11 Niels Möller <nisse@king.swox.se> + + * tune/tuneup.c: Updated tuning of gcdext. + + * mpn/x86_64/gmp-mparam.h (GCDEXT_DC_THRESHOLD): Reduced threshold + from 713 to 409. + +2008-09-11 Niels Möller <nisse@lysator.liu.se> + + * gmp-impl.h: Updated for gcdext changes. + (GCDEXT_DC_THRESHOLD): New constant, renamed from + GCDEXT_SCHOENHAGE_THRESHOLD. + + * mpn/generic/gcdext.c (compute_v): Accept non-normalized a and b + as inputs. + (mpn_gcdext): Rewrote and simplified. Now uses the new mpn_hgcd + interface. + + * mpn/generic/hgcd.c (mpn_hgcd_addmul2_n): Renamed from addmul2_n + and made non-static. Changed interface to take non-normalized + inputs, and only two size arguments. + (mpn_hgcd_matrix_mul): Simplified using new mpn_hgcd_addmul2_n. + + * mpn/generic/gcdext_lehmer.c (mpn_gcdext_lehmer_itch): Deleted + function. + (mpn_gcdext_lehmer_n): Renamed from mpn_gcd_lehmer. Now takes + inputs of equal size. Moved the code for the division step to a + separate function... + * mpn/generic/gcdext_subdiv_step.c (mpn_gcdext_subdiv_step): New + file, new function. + + * configure.in (gmp_mpn_functions): Added gcdext_subdiv_step. + 2008-09-10 Torbjorn Granlund <tege@swox.com> * gmp-h.in: Unconditionally include <cstdio>. +2008-09-10 Niels Möller <nisse@lysator.liu.se> + + * tune/common.c: #if:ed out speed_mpn_gcd_binary and + speed_mpn_gcd_accel. + * tune/speed.c (routine): #if:ed out mpn_gcd_binary, mpn_gcd_accel + and find_a. + * tune/Makefile.am (libspeed_la_SOURCES): Removed gcd_bin.c + gcd_accel.c gcd_finda_gen.c. 
+ * tune/tuneup.c: Enable tuning of GCD_DC_THRESHOLD. + + * mpn/generic/gcd.c (mpn_gcd): Rewrote and simplified. Now uses + the new mpn_hgcd interface. + + * */gmp-mparam.h: Renamed GCD_SCHOENHAGE_THRESHOLD to + GCD_DC_THRESHOLD. + + * mpn/generic/gcd_lehmer.c (mpn_gcd_lehmer_n): Renamed (was + mpn_gcd_lehmer). Now takes inputs of equal size. + + * mpn/generic/gcd_lehmer.c (mpn_gcd_lehmer): Reintroduced gcd_2, + to get better performance for small inputs. + + * mpn/generic/hgcd.c: Don't hardcode small HGCD_THRESHOLD. + * mpn/x86_64/gmp-mparam.h (HGCD_THRESHOLD): Reduced from 145 to + 120. + * */gmp-mparam.h: Renamed HGCD_SCHOENHAGE_THRESHOLD to + HGCD_THRESHOLD. + 2008-09-09 Torbjorn Granlund <tege@swox.com> * doc/gmp.texi: Fix a typo and clarify mpn_gcdext docs. +2008-09-09 Niels Möller <nisse@lysator.liu.se> + + * tune/common.c (speed_mpn_hgcd, speed_mpn_hgcd_lehmer): Adapted + to new hgcd interface. + + * gmp-impl.h (MPN_HGCD_LEHMER_ITCH): New macro. + + * hgcd.c (mpn_hgcd_lehmer): Renamed function, from hgcd_base. Made + non-static. + + * gcd_lehmer.c (mpn_gcd_lehmer): Use hgcd2 also for n == 2. + + * gcdext_lehmer.c (mpn_gcdext_lehmer): Simplified code for + division step. Added proper book-keeping of swaps, which affect + the sign of the returned cofactor. + + * tests/mpz/t-gcd.c (one_test): Display co-factor when mpn_gcdext + fails. + + * gcd_lehmer.c (mpn_gcd_lehmer): At end of loop, need to handle + the special case n == 1 correctly. + + * gcd_subdiv_step.c (mpn_gcd_subdiv_step): Simplified function. + The special cancellation logic is not needed here. + 2008-09-08 Torbjorn Granlund <tege@swox.com> * mpn/generic/invert.c: Add working but slow code. @@ -94,6 +287,26 @@ * gmp-h.in (__GMP_CC): New #define. (__GMP_CFLAGS): New #define. +2008-09-08 Niels Möller <nisse@lysator.liu.se> + + * tests/mpn/t-hgcd.c: Updated tests. Rewrite of hgcd_ref. + + * mpn/generic/gcdext_lehmer.c (mpn_gcdext_lehmer_itch): New function. + (mpn_gcdext_lehmer): Various bugfixes. 
+ + * gcdext.c (mpn_gcdext): Allocate scratch space for gcdext_lehmer. + + * mpn/generic/gcd_lehmer.c (gcd_2): ASSERT that inputs are odd. + (mpn_gcd_lehmer): Added tp argument, for scratch space. Make both + arguments odd before calling gcd_2. + + * mpn/generic/hgcd.c (mpn_hgcd): Allow the trivial case n <= 2, + and return 0 immediately. + + * gmp-impl.h (MPN_EXTRACT_NUMB): New macro. + + * configure.in (gmp_mpn_functions): Added gcdext_lehmer. + 2008-09-05 Torbjorn Granlund <tege@swox.com> * mpn/generic/toom_interpolate_7pts.c: Use mpn_divexact_by3c instead of @@ -856,6 +1069,12 @@ * mpn/generic/mul_fft.c: Optimize many scalar divisions and mod operations into masks and shifts. (mpn_fft_mul_modF_K): Fix a spurious ASSERT_NOCARRY. + (mpn_fft_belge_butterfly, mpn_fft_fft_belgeRec, mpn_fft_fft_belge, + mpn_fft_fft_belgeInvRec, mpn_fft_fft_belgeInv): Add Pierrick Gaudry's + implementation of the cache-optimized "belge" FFT code. + (mpn_fft_fft_sqr, mpn_fft_butterfly, mpn_fft_fft, mpn_fft_fftinv): + Remove. + (mpn_mul_fft_internal): Corresponding updates. 2006-03-26 Torbjorn Granlund <tege@swox.com> @@ -1187,6 +1406,9 @@ * tests/mpz/reuse.c: Test mpz_rootrem. + From Paul Zimmermann: + * mpn/generic/rootrem.c: Complete rewrite. 
+ 2005-10-31 Torbjorn Granlund <tege@swox.com> * mpz/pprime_p.c (mpz_probab_prime_p): Considerably limit trial diff --git a/configure.in b/configure.in index fe0584285..7eea50ee4 100644 --- a/configure.in +++ b/configure.in @@ -2407,8 +2407,11 @@ gmp_mpn_functions="$extra_functions \ fib2_ui mod_1 mod_34lsub1 mode1o pre_divrem_1 pre_mod_1 dump \ mul mul_fft mul_n mul_basecase sqr_basecase random random2 pow_1 \ rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \ - bdivmod gcd_1 gcd gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \ - hgcd2 hgcd qstack mullow_n mullow_basecase \ + bdivmod gcd_1 gcd gcdext_1 gcdext gcd_lehmer gcd_subdiv_step \ + gcdext_lehmer gcdext_subdiv_step \ + tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \ + matrix22_mul \ + hgcd2 hgcd mullow_n mullow_basecase \ mul_toom22 mul_toom32 mul_toom42 mul_toom62 mul_toom53 mul_toom44 \ toom_interpolate_5pts toom_interpolate_7pts invert binvert \ sb_div_qr sb_divappr_q sb_div_q dc_div_qr dc_divappr_q dc_div_q \ @@ -1505,6 +1505,9 @@ __GMP_DECLSPEC mp_size_t mpn_gcd __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr #define mpn_gcd_1 __MPN(gcd_1) __GMP_DECLSPEC mp_limb_t mpn_gcd_1 __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)) __GMP_ATTRIBUTE_PURE; +#define mpn_gcdext_1 __MPN(gcdext_1) +__GMP_DECLSPEC mp_limb_t mpn_gcdext_1 __GMP_PROTO ((mp_ptr, mp_ptr, mp_limb_t, mp_limb_t)) __GMP_ATTRIBUTE_PURE; + #define mpn_gcdext __MPN(gcdext) __GMP_DECLSPEC mp_size_t mpn_gcdext __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); diff --git a/gmp-impl.h b/gmp-impl.h index 0433e8527..4dcfc6497 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -71,6 +71,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ mp_limb_t name __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t)) #define DECL_gcd_1(name) \ mp_limb_t name __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)) +#define DECL_gcdext_1(name) \ + mp_limb_t name __GMP_PROTO ((mp_ptr, mp_ptr, mp_limb_t, mp_limb_t)) #define DECL_lshift(name) \ mp_limb_t name __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, unsigned)) #define DECL_mod_1(name) \ @@ -3439,176 +3441,156 @@ void __gmp_invalid_operation _PROTO ((void)) ATTRIBUTE_NORETURN; } \ } while (0) - -/* HGCD definitions */ - -/* Limited by 2 + twice the bitsize of mp_size_t */ -#define QSTACK_MAX_QUOTIENTS 82 - -/* Name mangling */ -#define qstack_itch __gmpn_qstack_itch -#define qstack_init __gmpn_qstack_init -#define qstack_reset __gmpn_qstack_reset -#define qstack_rotate __gmpn_qstack_rotate - -#define mpn_hgcd2 __gmpn_hgcd2 -#define mpn_hgcd2_fix __gmpn_hgcd2_fix -#define mpn_hgcd2_lehmer_step __gmpn_hgcd2_lehmer_step -#define mpn_hgcd_max_recursion __gmpn_hgcd_max_recursion -#define mpn_hgcd_init_itch __gmpn_hgcd_init_itch -#define mpn_hgcd_init __gmpn_hgcd_init -#define mpn_hgcd_lehmer_itch __gmpn_hgcd_lehmer_itch -#define mpn_hgcd_lehmer __gmpn_hgcd_lehmer -#define mpn_hgcd_itch __gmpn_hgcd_itch -#define mpn_hgcd __gmpn_hgcd -#define mpn_hgcd_equal __gmpn_hgcd_equal -#define mpn_hgcd_fix __gmpn_hgcd_fix - -struct qstack -{ - /* Throughout the code we represent q = 1 with qsize = 0. */ - mp_size_t size[QSTACK_MAX_QUOTIENTS]; - mp_ptr limb; - mp_size_t limb_alloc; - - /* Number of quotients to keep when we discard old quotients */ - unsigned nkeep; - - /* Top quotient is of size size[size_next-1], and starts at - limb+limb_next - size[size_next-1]. 
We use size_next == 0 for an - empty stack.*/ - unsigned size_next; - mp_size_t limb_next; -}; +/* Matrix multiplication */ +#define mpn_matrix22_mul __MPN(matrix22_mul) +#define mpn_matrix22_strassen __MPN(matrix22_mul_strassen) +#define mpn_matrix22_mul_itch __MPN(matrix22_mul_itch) mp_size_t -qstack_itch __GMP_PROTO ((mp_size_t)); +mpn_matrix22_mul_itch (mp_size_t, mp_size_t); void -qstack_init __GMP_PROTO ((struct qstack *, mp_size_t, mp_limb_t *, mp_size_t)); - +mpn_matrix22_mul (mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, + mp_srcptr, mp_srcptr, mp_srcptr, mp_srcptr, mp_size_t, + mp_ptr); void -qstack_reset __GMP_PROTO ((struct qstack *, mp_size_t)); +mpn_matrix22_mul_strassen (mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, + mp_srcptr, mp_srcptr, mp_srcptr, mp_srcptr, mp_size_t, + mp_ptr); -void -qstack_rotate __GMP_PROTO ((struct qstack *, mp_size_t)); - -#if WANT_ASSERT -void -__gmpn_qstack_sanity __GMP_PROTO ((struct qstack *)); -#define ASSERT_QSTACK __gmpn_qstack_sanity -#else -#define ASSERT_QSTACK(stack) +#ifndef MATRIX22_STRASSEN_THRESHOLD +#define MATRIX22_STRASSEN_THRESHOLD 30 #endif -struct hgcd2_row -{ - /* r = (-)u a + (-)v b */ - mp_limb_t u; - mp_limb_t v; -}; +/* HGCD definitions */ + +/* Extract one numb, shifting count bits left + ________ ________ + |___xh___||___xl___| + |____r____| + >count < + + The count includes any nail bits, so it should work fine if count + is computed using count_leading_zeros. If GMP_NAIL_BITS > 0, all of + xh, xl and r include nail bits. Must have 0 < count < GMP_LIMB_BITS. -struct hgcd2 + FIXME: Omit masking with GMP_NUMB_MASK, and let callers do that for + those calls where the count high bits of xh may be non-zero. 
+*/ + +#define MPN_EXTRACT_NUMB(count, xh, xl) \ + ((((xh) << ((count) - GMP_NAIL_BITS)) & GMP_NUMB_MASK) | \ + ((xl) >> (GMP_LIMB_BITS - (count)))) + +#define mpn_hgcd2 __MPN (hgcd2) +#define mpn_hgcd_mul_matrix1_vector __MPN (hgcd_mul_matrix1_vector) +#define mpn_hgcd_mul_matrix1_inverse_vector __MPN (hgcd_mul_matrix1_inverse_vector) + +#define mpn_hgcd_matrix_init __MPN (hgcd_matrix_init) +#define mpn_hgcd_matrix_mul __MPN (hgcd_matrix_mul) +#define mpn_hgcd_matrix_adjust __MPN (hgcd_matrix_adjust) + +#define mpn_hgcd_step __MPN (hgcd_step) +#define mpn_hgcd_itch __MPN (hgcd_itch) +#define mpn_hgcd __MPN (hgcd) +#define mpn_hgcd_lehmer __MPN (hgcd_lehmer) + +#define mpn_gcd_lehmer_n __MPN(gcd_lehmer_n) +#define mpn_gcd_subdiv_step __MPN(gcd_subdiv_step) +#define mpn_gcdext_lehmer_n __MPN(gcdext_lehmer_n) +#define mpn_gcdext_subdiv_step __MPN(gcdext_subdiv_step) + +/* The matrix non-negative M = (u, u'; v,v') keeps track of the + reduction (a;b) = M (alpha; beta) where alpha, beta are smaller + than a, b. The determinant must always be one, so that M has an + inverse (v', -u'; -v, u). Elements always fit in GMP_NUMB_BITS - 1 + bits. 
*/ +struct hgcd_matrix1 { - /* Sign of the first row, sign >= 0 implies that u >= 0 and v <= 0, - sign < 0 implies u <= 0, v >= 0 */ - int sign; - struct hgcd2_row row[4]; + mp_limb_t u[2][2]; }; int -mpn_hgcd2 __GMP_PROTO ((struct hgcd2 *, - mp_limb_t, mp_limb_t, - mp_limb_t, mp_limb_t, - struct qstack *)); +mpn_hgcd2 __GMP_PROTO ((mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, + struct hgcd_matrix1 *)); mp_size_t -mpn_hgcd2_fix __GMP_PROTO ((mp_ptr, mp_size_t, - int, - mp_limb_t, mp_srcptr, mp_size_t, - mp_limb_t, mp_srcptr, mp_size_t)); - -int -mpn_hgcd2_lehmer_step __GMP_PROTO ((struct hgcd2 *, - mp_srcptr, mp_size_t, - mp_srcptr, mp_size_t, - struct qstack *)); - -unsigned -mpn_hgcd_max_recursion __GMP_PROTO ((mp_size_t)); +mpn_hgcd_mul_matrix1_vector __GMP_PROTO ((struct hgcd_matrix1 *, mp_size_t, + mp_ptr, mp_ptr, mp_ptr)); -struct hgcd_row -{ - /* [rp, rsize] should always be normalized. */ - mp_ptr rp; mp_size_t rsize; - mp_ptr uvp[2]; -}; +mp_size_t +mpn_hgcd_mul_matrix1_inverse_vector __GMP_PROTO ((struct hgcd_matrix1 *, mp_size_t, + mp_ptr, mp_ptr, mp_ptr)); -struct hgcd +struct hgcd_matrix { - int sign; - /* Space allocated for the uv entries, for sanity checking */ + /* For sanity checking only */ mp_size_t alloc; - /* Size of the largest u,v entry, usually row[3].uvp[1]. This - element should be normalized. Smaller elements must be zero - padded, and all unused limbs (i.e. between size and alloc) must - be zero. 
*/ - mp_size_t size; - struct hgcd_row row[4]; + + mp_size_t n; + mp_ptr p[2][2]; }; -mp_size_t -mpn_hgcd_init_itch __GMP_PROTO ((mp_size_t)); +#define MPN_HGCD_MATRIX_INIT_ITCH(n) (4 * ((n+1)/2 + 1)) void -mpn_hgcd_init __GMP_PROTO ((struct hgcd *, - mp_size_t, - mp_limb_t *)); +mpn_hgcd_matrix_init __GMP_PROTO ((struct hgcd_matrix *, mp_size_t, mp_ptr)); +void +mpn_hgcd_matrix_mul __GMP_PROTO ((struct hgcd_matrix *, const struct hgcd_matrix *, + mp_ptr)); mp_size_t -mpn_hgcd_lehmer_itch __GMP_PROTO ((mp_size_t)); - -int -mpn_hgcd_lehmer __GMP_PROTO ((struct hgcd *, - mp_srcptr, mp_size_t, - mp_srcptr, mp_size_t, - struct qstack *, - mp_ptr, mp_size_t)); +mpn_hgcd_matrix_adjust __GMP_PROTO ((struct hgcd_matrix *, + mp_size_t, mp_ptr, mp_ptr, + mp_size_t, mp_ptr)); mp_size_t mpn_hgcd_itch __GMP_PROTO ((mp_size_t)); -int -mpn_hgcd __GMP_PROTO ((struct hgcd *, - mp_srcptr, mp_size_t, - mp_srcptr, mp_size_t, - struct qstack *, - mp_ptr, mp_size_t)); +mp_size_t +mpn_hgcd __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, + struct hgcd_matrix *, mp_ptr)); -#if WANT_ASSERT -void -__gmpn_hgcd_sanity __GMP_PROTO ((const struct hgcd *, - mp_srcptr, mp_size_t, - mp_srcptr, mp_size_t, - unsigned, unsigned)); -#define ASSERT_HGCD __gmpn_hgcd_sanity -#else -#define ASSERT_HGCD(hgcd, ap, asize, bp, bsize, start, end) -#endif +#define MPN_HGCD_LEHMER_ITCH(n) (n) -int -mpn_hgcd_equal __GMP_PROTO ((const struct hgcd *, const struct hgcd *)); +mp_size_t +mpn_hgcd_lehmer __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, + struct hgcd_matrix *, mp_ptr)); + +/* Needs storage for the quotient */ +#define MPN_GCD_SUBDIV_STEP_ITCH(n) (n) + +mp_size_t +mpn_gcd_subdiv_step __GMP_PROTO ((mp_ptr, mp_size_t *, + mp_ptr, mp_ptr, mp_size_t, mp_ptr)); + +#define MPN_GCD_LEHMER_N_ITCH(n) (n) + +mp_size_t +mpn_gcd_lehmer_n __GMP_PROTO ((mp_ptr, mp_ptr, mp_ptr, mp_size_t, + mp_ptr)); + +/* To calculate the needed scratch space, n should be a bound for both + input and output sizes. 
*/ +#define MPN_GCDEXT_SUBDIV_ITCH(n) (2*(n) + 1) + +mp_size_t +mpn_gcdext_subdiv_step __GMP_PROTO ((mp_ptr, mp_size_t *, mp_ptr, mp_size_t *, + mp_ptr, mp_ptr, mp_size_t, + mp_ptr, mp_ptr, mp_size_t *, mp_ptr)); + +#define MPN_GCDEXT_LEHMER_N_ITCH(n) (4*(n) + 3) mp_size_t -mpn_hgcd_fix __GMP_PROTO ((mp_size_t, - mp_ptr, mp_size_t, - int, mp_size_t, - const struct hgcd_row *, - mp_srcptr, mp_srcptr, - mp_ptr, mp_size_t)); +mpn_gcdext_lehmer_n __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, + mp_ptr, mp_ptr, mp_size_t, + mp_ptr)); + +/* 4*(an + 1) + 4*(bn + 1) + an */ +#define MPN_GCDEXT_LEHMER_ITCH(an, bn) (5*(an) + 4*(bn) + 8) -#ifndef HGCD_SCHOENHAGE_THRESHOLD -#define HGCD_SCHOENHAGE_THRESHOLD 150 +#ifndef HGCD_THRESHOLD +#define HGCD_THRESHOLD 400 #endif #if 0 @@ -3617,12 +3599,12 @@ mpn_hgcd_fix __GMP_PROTO ((mp_size_t, #endif #endif -#ifndef GCD_SCHOENHAGE_THRESHOLD -#define GCD_SCHOENHAGE_THRESHOLD 1000 +#ifndef GCD_DC_THRESHOLD +#define GCD_DC_THRESHOLD 1000 #endif -#ifndef GCDEXT_SCHOENHAGE_THRESHOLD -#define GCDEXT_SCHOENHAGE_THRESHOLD 600 +#ifndef GCDEXT_DC_THRESHOLD +#define GCDEXT_DC_THRESHOLD 600 #endif /* Definitions for mpn_set_str and mpn_get_str */ @@ -4044,9 +4026,13 @@ extern mp_size_t div_dc_threshold; #define POWM_THRESHOLD powm_threshold extern mp_size_t powm_threshold; -#undef HGCD_SCHOENHAGE_THRESHOLD -#define HGCD_SCHOENHAGE_THRESHOLD hgcd_schoenhage_threshold -extern mp_size_t hgcd_schoenhage_threshold; +#undef MATRIX22_STRASSEN_THRESHOLD +#define MATRIX22_STRASSEN_THRESHOLD matrix22_strassen_threshold +extern mp_size_t matrix22_strassen_threshold; + +#undef HGCD_THRESHOLD +#define HGCD_THRESHOLD hgcd_threshold +extern mp_size_t hgcd_threshold; #undef GCD_ACCEL_THRESHOLD #define GCD_ACCEL_THRESHOLD gcd_accel_threshold @@ -4058,13 +4044,13 @@ extern mp_size_t gcd_accel_threshold; extern mp_size_t gcd_lehmer_threshold; #endif -#undef GCD_SCHOENHAGE_THRESHOLD -#define GCD_SCHOENHAGE_THRESHOLD gcd_schoenhage_threshold -extern mp_size_t 
gcd_schoenhage_threshold; +#undef GCD_DC_THRESHOLD +#define GCD_DC_THRESHOLD gcd_dc_threshold +extern mp_size_t gcd_dc_threshold; -#undef GCDEXT_SCHOENHAGE_THRESHOLD -#define GCDEXT_SCHOENHAGE_THRESHOLD gcdext_schoenhage_threshold -extern mp_size_t gcdext_schoenhage_threshold; +#undef GCDEXT_DC_THRESHOLD +#define GCDEXT_DC_THRESHOLD gcdext_dc_threshold +extern mp_size_t gcdext_dc_threshold; #undef DIVREM_1_NORM_THRESHOLD #define DIVREM_1_NORM_THRESHOLD divrem_1_norm_threshold diff --git a/mpn/Makefile.am b/mpn/Makefile.am index 78f88e24c..d883ec2b8 100644 --- a/mpn/Makefile.am +++ b/mpn/Makefile.am @@ -40,7 +40,8 @@ nodist_EXTRA_libmpn_la_SOURCES = \ dump.c fib2_ui.c gcd.c \ gcd_finda.c gcd_1.c gcdext.c get_d.c get_str.c \ hamdist.c hgcd2.c hgcd.c invert_limb.c \ - ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \ + ior_n.c iorn_n.c jacbase.c lshift.c \ + matrix22_mul.c mod_1.c mod_34lsub1.c mode1o.c \ mul.c mul_1.c mul_2.c mul_3.c mul_4.c mul_fft.c mul_n.c mul_basecase.c \ mul_toom22.c mul_toom32.c mul_toom42.c \ mullow_n.c mullow_basecase.c nand_n.c nior_n.c perfsqr.c popcount.c \ @@ -72,5 +73,7 @@ mp_bases.c: perfsqr.h: cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h +tune-gcd-p: gcd.c + $(COMPILE) -DTUNE_GCD_P=1 gcd.c -o tune-gcd-p -L ../.libs -lgmp include Makeasm.am diff --git a/mpn/alpha/ev5/gmp-mparam.h b/mpn/alpha/ev5/gmp-mparam.h index a58805781..9de9c07a2 100644 --- a/mpn/alpha/ev5/gmp-mparam.h +++ b/mpn/alpha/ev5/gmp-mparam.h @@ -41,10 +41,10 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 46 #define POWM_THRESHOLD 87 -#define HGCD_SCHOENHAGE_THRESHOLD 97 +#define HGCD_THRESHOLD 106 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 566 -#define GCDEXT_SCHOENHAGE_THRESHOLD 322 +#define GCD_DC_THRESHOLD 622 +#define GCDEXT_SCHOENHAGE_THRESHOLD 293 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ diff --git a/mpn/alpha/ev6/gmp-mparam.h b/mpn/alpha/ev6/gmp-mparam.h index 33ea80a54..f259a2278 100644 --- a/mpn/alpha/ev6/gmp-mparam.h +++ b/mpn/alpha/ev6/gmp-mparam.h @@ -42,9 +42,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 116 #define POWM_THRESHOLD 212 -#define HGCD_SCHOENHAGE_THRESHOLD 407 +#define HGCD_THRESHOLD 407 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 867 +#define GCD_DC_THRESHOLD 867 #define GCDEXT_SCHOENHAGE_THRESHOLD 867 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/alpha/ev6/nails/gmp-mparam.h b/mpn/alpha/ev6/nails/gmp-mparam.h index 5d884e3bb..1bc93b52c 100644 --- a/mpn/alpha/ev6/nails/gmp-mparam.h +++ b/mpn/alpha/ev6/nails/gmp-mparam.h @@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 48 #define POWM_THRESHOLD 113 -#define HGCD_SCHOENHAGE_THRESHOLD 78 +#define HGCD_THRESHOLD 78 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 392 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 392 #define JACOBI_BASE_METHOD 1 #define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ diff --git a/mpn/alpha/gmp-mparam.h b/mpn/alpha/gmp-mparam.h index 138cc5438..37f700494 100644 --- a/mpn/alpha/gmp-mparam.h +++ b/mpn/alpha/gmp-mparam.h @@ -41,9 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 38 #define POWM_THRESHOLD 53 -#define HGCD_SCHOENHAGE_THRESHOLD 63 +#define HGCD_THRESHOLD 63 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 476 +#define GCD_DC_THRESHOLD 476 #define GCDEXT_SCHOENHAGE_THRESHOLD 225 #define JACOBI_BASE_METHOD 2 diff --git a/mpn/arm/gmp-mparam.h b/mpn/arm/gmp-mparam.h index a142605fb..80b6ff8ee 100644 --- a/mpn/arm/gmp-mparam.h +++ b/mpn/arm/gmp-mparam.h @@ -37,7 +37,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 150 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 0 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ diff --git a/mpn/cray/gmp-mparam.h b/mpn/cray/gmp-mparam.h index b7da45c43..72dcb627d 100644 --- a/mpn/cray/gmp-mparam.h +++ b/mpn/cray/gmp-mparam.h @@ -41,10 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 996 #define POWM_THRESHOLD 601 -#define HGCD_SCHOENHAGE_THRESHOLD 964 +#define HGCD_THRESHOLD 964 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 2874 -#define GCDEXT_THRESHOLD 6 +#define GCD_DC_THRESHOLD 2874 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ diff --git a/mpn/cray/ieee/gmp-mparam.h b/mpn/cray/ieee/gmp-mparam.h index d5a866000..03d655c81 100644 --- a/mpn/cray/ieee/gmp-mparam.h +++ b/mpn/cray/ieee/gmp-mparam.h @@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 390 #define POWM_THRESHOLD 656 -#define HGCD_SCHOENHAGE_THRESHOLD 964 +#define HGCD_THRESHOLD 964 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 964 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 964 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ diff --git a/mpn/generic/gcd.c b/mpn/generic/gcd.c index 30d6969a3..786c328f3 100644 --- a/mpn/generic/gcd.c +++ b/mpn/generic/gcd.c @@ -18,852 +18,255 @@ License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ -/* Integer greatest common divisor of two unsigned integers, using - the accelerated algorithm (see reference below). - - mp_size_t mpn_gcd (up, usize, vp, vsize). - - Preconditions [U = (up, usize) and V = (vp, vsize)]: - - 1. V is odd. - 2. numbits(U) >= numbits(V). - - Both U and V are destroyed by the operation. The result is left at vp, - and its size is returned. - - Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) - - Funding for this work has been partially provided by Conselho Nacional - de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant - 301314194-2, and was done while I was a visiting reseacher in the Instituto - de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). - - Refer to - K. Weber, The accelerated integer GCD algorithm, ACM Transactions on - Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ - -#include <stdio.h> /* for NULL */ - #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" - -/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated - algorithm is used, otherwise the binary algorithm is used. This may be - adjusted for different architectures. 
*/ -#ifndef GCD_ACCEL_THRESHOLD -#define GCD_ACCEL_THRESHOLD 5 -#endif - -/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated - algorithm reduces using the bmod operation. Otherwise, the k-ary reduction - is used. 0 <= BMOD_THRESHOLD < GMP_NUMB_BITS. */ -enum - { - BMOD_THRESHOLD = GMP_NUMB_BITS/2 - }; - - -/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2. - Both U and V must be odd. */ -static inline mp_size_t -gcd_2 (mp_ptr vp, mp_srcptr up) +static inline int +mpn_zero_p (mp_srcptr ap, mp_size_t n) { - mp_limb_t u0, u1, v0, v1; - mp_size_t vsize; - - u0 = up[0]; - u1 = up[1]; - v0 = vp[0]; - v1 = vp[1]; - - while (u1 != v1 && u0 != v0) + mp_size_t i; + for (i = n - 1; i >= 0; i--) { - unsigned long int r; - if (u1 > v1) - { - u1 -= v1 + (u0 < v0); - u0 = (u0 - v0) & GMP_NUMB_MASK; - count_trailing_zeros (r, u0); - u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r); - u1 >>= r; - } - else /* u1 < v1. */ - { - v1 -= u1 + (v0 < u0); - v0 = (v0 - u0) & GMP_NUMB_MASK; - count_trailing_zeros (r, v0); - v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r); - v1 >>= r; - } + if (ap[i] != 0) + return 0; } - - vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0); - - /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */ - if (u1 == v1 && u0 == v0) - return vsize; - - v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0; - vp[0] = mpn_gcd_1 (vp, vsize, v0); - return 1; } -/* The function find_a finds 0 < N < 2^GMP_NUMB_BITS such that there exists - 0 < |D| < 2^GMP_NUMB_BITS, and N == D * C mod 2^(2*GMP_NUMB_BITS). - In the reference article, D was computed along with N, but it is better to - compute D separately as D <-- N / C mod 2^(GMP_NUMB_BITS + 1), treating - the result as a twos' complement signed integer. - - Initialize N1 to C mod 2^(2*GMP_NUMB_BITS). 
According to the reference - article, N2 should be initialized to 2^(2*GMP_NUMB_BITS), but we use - 2^(2*GMP_NUMB_BITS) - N1 to start the calculations within double - precision. If N2 > N1 initially, the first iteration of the while loop - will swap them. In all other situations, N1 >= N2 is maintained. */ - -#if HAVE_NATIVE_mpn_gcd_finda -#define find_a(cp) mpn_gcd_finda (cp) +/* Uses the HGCD operation described in + + N. Möller, On Schönhage's algorithm and subquadratic integer gcd + computation, Math. Comp. 77 (2008), 589-607. + + to reduce inputs until they are of size below GCD_DC_THRESHOLD, and + then uses Lehmer's algorithm. +*/ + +/* Some reasonable choices are n / 2 (same as in hgcd), and p = (n + + * 2)/3, which gives a balanced multiplication in + * mpn_hgcd_matrix_adjust. However, p = 2 n/3 gives slightly better + * performance. The matrix-vector multiplication is then + * 4:1-unbalanced, with matrix elements of size n/6, and vector + * elements of size p = 2n/3. */ + +/* From analysis of the theoretical running time, it appears that when + * multiplication takes time O(n^alpha), p should be choosen so that + * the ratio of the time for the mpn_hgcd call, and the time for the + * multiplication in mpn_hgcd_matrix_adjust, is roughly 1/(alpha - + * 1). */ +#ifdef TUNE_GCD_P +#define P_TABLE_SIZE 10000 +mp_size_t p_table[P_TABLE_SIZE]; +#define CHOOSE_P(n) ( (n) < P_TABLE_SIZE ? p_table[n] : 2*(n)/3) #else -static -#if ! defined (__i386__) -inline /* don't inline this for the x86 */ +#define CHOOSE_P(n) (2*(n) / 3) #endif -mp_limb_t -find_a (mp_srcptr cp) -{ - unsigned long int leading_zero_bits = 0; - mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^GMP_NUMB_BITS + n1_l. */ - mp_limb_t n1_h = cp[1]; - - mp_limb_t n2_l = (-n1_l & GMP_NUMB_MASK); /* N2 == n2_h * 2^GMP_NUMB_BITS + n2_l. */ - mp_limb_t n2_h = (~n1_h & GMP_NUMB_MASK); - - /* Main loop. */ - while (n2_h != 0) /* While N2 >= 2^GMP_NUMB_BITS. */ - { - /* N1 <-- N1 % N2. 
*/ - if (((GMP_NUMB_HIGHBIT >> leading_zero_bits) & n2_h) == 0) - { - unsigned long int i; - count_leading_zeros (i, n2_h); - i -= GMP_NAIL_BITS; - i -= leading_zero_bits; - leading_zero_bits += i; - n2_h = ((n2_h << i) & GMP_NUMB_MASK) | (n2_l >> (GMP_NUMB_BITS - i)); - n2_l = (n2_l << i) & GMP_NUMB_MASK; - do - { - if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) - { - n1_h -= n2_h + (n1_l < n2_l); - n1_l = (n1_l - n2_l) & GMP_NUMB_MASK; - } - n2_l = (n2_l >> 1) | ((n2_h << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK); - n2_h >>= 1; - i -= 1; - } - while (i != 0); - } - if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) - { - n1_h -= n2_h + (n1_l < n2_l); - n1_l = (n1_l - n2_l) & GMP_NUMB_MASK; - } - - MP_LIMB_T_SWAP (n1_h, n2_h); - MP_LIMB_T_SWAP (n1_l, n2_l); - } - - return n2_l; -} -#endif - -/* v must be odd */ -static mp_size_t -gcd_binary_odd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +mp_size_t +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n) { - mp_ptr orig_vp = vp; - mp_size_t orig_vsize = vsize; - int binary_gcd_ctr; /* Number of times binary gcd will execute. */ + mp_size_t talloc; + mp_size_t scratch; + mp_size_t matrix_scratch; + + mp_size_t gn; + mp_ptr tp; TMP_DECL; - ASSERT (usize >= 1); - ASSERT (vsize >= 1); - ASSERT (usize >= vsize); - ASSERT (vp[0] & 1); - ASSERT (up[usize - 1] != 0); - ASSERT (vp[vsize - 1] != 0); -#if WANT_ASSERT - if (usize == vsize) + /* FIXME: Check for small sizes first, before setting up temporary + storage etc. 
*/ + talloc = MPN_GCD_LEHMER_N_ITCH(n); + + /* For initial division */ + scratch = usize - n + 1; + if (scratch > talloc) + talloc = scratch; + +#if TUNE_GCD_P + if (CHOOSE_P (n) > 0) +#else + if (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD)) +#endif { - int uzeros, vzeros; - count_leading_zeros (uzeros, up[usize - 1]); - count_leading_zeros (vzeros, vp[vsize - 1]); - ASSERT (uzeros <= vzeros); - } + mp_size_t hgcd_scratch; + mp_size_t update_scratch; + mp_size_t p = CHOOSE_P (n); + mp_size_t scratch; +#if TUNE_GCD_P + /* Worst case, since we don't guarantee that n - CHOOSE_P(n) + is increasing */ + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n); + hgcd_scratch = mpn_hgcd_itch (n); + update_scratch = 2*(n - 1); +#else + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + hgcd_scratch = mpn_hgcd_itch (n - p); + update_scratch = p + n - 1; #endif - ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize)); - ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, up, usize)); - ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, vp, vsize)); + scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); + if (scratch > talloc) + talloc = scratch; + } TMP_MARK; + tp = TMP_ALLOC_LIMBS(talloc); - /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD. - Two EXTRA limbs for U and V are required for kary reduction. */ - if (vsize >= GCD_ACCEL_THRESHOLD) + if (usize > n) { - unsigned long int vbitsize, d; - mp_ptr orig_up = up; - mp_size_t orig_usize = usize; - mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB); - - MPN_COPY (anchor_up, orig_up, usize); - up = anchor_up; - - count_leading_zeros (d, up[usize - 1]); - d -= GMP_NAIL_BITS; - d = usize * GMP_NUMB_BITS - d; - count_leading_zeros (vbitsize, vp[vsize - 1]); - vbitsize -= GMP_NAIL_BITS; - vbitsize = vsize * GMP_NUMB_BITS - vbitsize; - ASSERT (d >= vbitsize); - d = d - vbitsize + 1; - - /* Use bmod reduction to quickly discover whether V divides U. */ - up[usize++] = 0; /* Insert leading zero. 
*/ - mpn_bdivmod (up, up, usize, vp, vsize, d); - - /* Now skip U/V mod 2^d and any low zero limbs. */ - d /= GMP_NUMB_BITS, up += d, usize -= d; - while (usize != 0 && up[0] == 0) - up++, usize--; - - if (usize == 0) /* GCD == ORIG_V. */ - goto done; - - vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB); - MPN_COPY (vp, orig_vp, vsize); - - do /* Main loop. */ - { - /* mpn_com_n can't be used here because anchor_up and up may - partially overlap */ - if ((up[usize - 1] & GMP_NUMB_HIGHBIT) != 0) /* U < 0; take twos' compl. */ - { - mp_size_t i; - anchor_up[0] = -up[0] & GMP_NUMB_MASK; - for (i = 1; i < usize; i++) - anchor_up[i] = (~up[i] & GMP_NUMB_MASK); - up = anchor_up; - } - - MPN_NORMALIZE_NOT_ZERO (up, usize); - - if ((up[0] & 1) == 0) /* Result even; remove twos. */ - { - unsigned int r; - count_trailing_zeros (r, up[0]); - mpn_rshift (anchor_up, up, usize, r); - usize -= (anchor_up[usize - 1] == 0); - } - else if (anchor_up != up) - MPN_COPY_INCR (anchor_up, up, usize); - - MPN_PTR_SWAP (anchor_up,usize, vp,vsize); - up = anchor_up; - - if (vsize <= 2) /* Kary can't handle < 2 limbs and */ - break; /* isn't efficient for == 2 limbs. */ + mpn_tdiv_qr (tp, up, 0, up, usize, vp, n); - d = vbitsize; - count_leading_zeros (vbitsize, vp[vsize - 1]); - vbitsize -= GMP_NAIL_BITS; - vbitsize = vsize * GMP_NUMB_BITS - vbitsize; - d = d - vbitsize + 1; - - if (d > BMOD_THRESHOLD) /* Bmod reduction. */ - { - up[usize++] = 0; - mpn_bdivmod (up, up, usize, vp, vsize, d); - d /= GMP_NUMB_BITS, up += d, usize -= d; - } - else /* Kary reduction. */ - { - mp_limb_t bp[2], cp[2]; - - /* C <-- V/U mod 2^(2*GMP_NUMB_BITS). */ - { - mp_limb_t u_inv, hi, lo; - modlimb_invert (u_inv, up[0]); - cp[0] = (vp[0] * u_inv) & GMP_NUMB_MASK; - umul_ppmm (hi, lo, cp[0], up[0] << GMP_NAIL_BITS); - lo >>= GMP_NAIL_BITS; - cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv & GMP_NUMB_MASK; - } - - /* U <-- find_a (C) * U. 
*/ - up[usize] = mpn_mul_1 (up, up, usize, find_a (cp)); - usize++; - - /* B <-- A/C == U/V mod 2^(GMP_NUMB_BITS + 1). - bp[0] <-- U/V mod 2^GMP_NUMB_BITS and - bp[1] <-- ( (U - bp[0] * V)/2^GMP_NUMB_BITS ) / V mod 2 - - Like V/U above, but simplified because only the low bit of - bp[1] is wanted. */ - { - mp_limb_t v_inv, hi, lo; - modlimb_invert (v_inv, vp[0]); - bp[0] = (up[0] * v_inv) & GMP_NUMB_MASK; - umul_ppmm (hi, lo, bp[0], vp[0] << GMP_NAIL_BITS); - lo >>= GMP_NAIL_BITS; - bp[1] = (up[1] + hi + (bp[0] & vp[1])) & 1; - } - - up[usize++] = 0; - if (bp[1] != 0) /* B < 0: U <-- U + (-B) * V. */ - { - mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0] & GMP_NUMB_MASK); - mpn_add_1 (up + vsize, up + vsize, usize - vsize, c); - } - else /* B >= 0: U <-- U - B * V. */ - { - mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]); - mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); - } - - up += 2, usize -= 2; /* At least two low limbs are zero. */ - } - - /* Must remove low zero limbs before complementing. */ - while (usize != 0 && up[0] == 0) - up++, usize--; + if (mpn_zero_p (up, n)) + { + MPN_COPY (gp, vp, n); + TMP_FREE; + return n; } - while (usize != 0); - - /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */ - up = orig_up, usize = orig_usize; - binary_gcd_ctr = 2; } - else - binary_gcd_ctr = 1; - /* Finish up with the binary algorithm. Executes once or twice. */ - for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize) +#if TUNE_GCD_P + while (CHOOSE_P (n) > 0) +#else + while (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD)) +#endif { - if (usize > 2) /* First make U close to V in size. 
*/ + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P (n); + mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + mp_size_t nn; + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (up + p, vp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) { - unsigned long int vbitsize, d; - count_leading_zeros (d, up[usize - 1]); - d -= GMP_NAIL_BITS; - d = usize * GMP_NUMB_BITS - d; - count_leading_zeros (vbitsize, vp[vsize - 1]); - vbitsize -= GMP_NAIL_BITS; - vbitsize = vsize * GMP_NUMB_BITS - vbitsize; - d = d - vbitsize - 1; - if (d != -(unsigned long int)1 && d > 2) - { - mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */ - d /= (unsigned long int)GMP_NUMB_BITS, up += d, usize -= d; - } + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + /* Temporary storage 2 (p + M->n) <= p + n - 1. */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, up, vp, p, tp + matrix_scratch); } - - /* Start binary GCD. */ - do + else { - mp_size_t zeros; - - /* Make sure U is odd. */ - MPN_NORMALIZE (up, usize); - while (up[0] == 0) - up += 1, usize -= 1; - if ((up[0] & 1) == 0) - { - unsigned int r; - count_trailing_zeros (r, up[0]); - mpn_rshift (up, up, usize, r); - usize -= (up[usize - 1] == 0); - } - - /* Keep usize >= vsize. */ - if (usize < vsize) - MPN_PTR_SWAP (up, usize, vp, vsize); - - if (usize <= 2) /* Double precision. */ - { - if (vsize == 1) - vp[0] = mpn_gcd_1 (up, usize, vp[0]); - else - vsize = gcd_2 (vp, up); - break; /* Binary GCD done. */ - } - - /* Count number of low zero limbs of U - V. */ - for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; ) - continue; - - /* If U < V, swap U and V; in any case, subtract V from U. */ - if (zeros == vsize) /* Subtract done. 
*/ - up += zeros, usize -= zeros; - else if (usize == vsize) + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (gp, &gn, up, vp, n, tp); + if (n == 0) { - mp_size_t size = vsize; - do - size--; - while (up[size] == vp[size]); - if (up[size] < vp[size]) /* usize == vsize. */ - MP_PTR_SWAP (up, vp); - up += zeros, usize = size + 1 - zeros; - mpn_sub_n (up, up, vp + zeros, usize); - } - else - { - mp_size_t size = vsize - zeros; - up += zeros, usize -= zeros; - if (mpn_sub_n (up, up, vp + zeros, size)) - { - while (up[size] == 0) /* Propagate borrow. */ - up[size++] = -(mp_limb_t)1; - up[size] -= 1; - } + TMP_FREE; + return gn; } } - while (usize); /* End binary GCD. */ } -done: - if (vp != gp) - MPN_COPY_INCR (gp, vp, vsize); + gn = mpn_gcd_lehmer_n (gp, up, vp, n, tp); TMP_FREE; - return vsize; + return gn; } -#define EVEN_P(x) (((x) & 1) == 0) - -/* Allows an even v */ -static mp_size_t -gcd_binary (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +#ifdef TUNE_GCD_P +#include <stdio.h> +#include <string.h> +#include <time.h> + +#define TIME(res, code) do { \ + clock_t time_start; \ + clock_t time_end; \ + clock_t time_end_time; \ + unsigned time_iter = 0; \ + \ + time_start = clock(); \ + time_end_time = time_start + CLOCKS_PER_SEC / 100; \ + do \ + { \ + code; \ + time_end = clock(); \ + time_iter++; \ + } \ + while (time_end <= time_end_time); \ + \ + (res) = (double) (time_end - time_start) / (CLOCKS_PER_SEC * time_iter); \ + } while (0) + +int +main(int argc, char *argv) { - mp_size_t zero_words = 0; - mp_size_t gsize; - unsigned shift = 0; - - ASSERT (usize > 0); - ASSERT (vsize > 0); + gmp_randstate_t rands; + mp_size_t n; + mp_ptr ap; + mp_ptr bp; + mp_ptr up; + mp_ptr vp; + mp_ptr gp; + mp_ptr tp; + TMP_DECL; - if (up[0] == 0 && vp[0] == 0) - { - do - gp[zero_words++] = 0; - while (up[zero_words] == 0 && vp[zero_words] == 0); + /* Unbuffered so if output is redirected to a file it isn't lost if the + program is killed part way through. 
*/ + setbuf (stdout, NULL); + setbuf (stderr, NULL); - up += zero_words; usize -= zero_words; - vp += zero_words; vsize -= zero_words; - gp += zero_words; - } + gmp_randinit_default (rands); - /* Now u and v can have a common power of two < 2^GMP_NUMB_BITS */ - if (up[0] == 0) - { - ASSERT (vp[0] != 0); - if (EVEN_P (vp[0])) - { - count_trailing_zeros (shift, vp[0]); - ASSERT (shift > 0); - ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, shift)); - if (vp[vsize - 1] == 0) - vsize--; - } - } - else if (vp[0] == 0) - { - if (EVEN_P (up[0])) - { - count_trailing_zeros (shift, up[0]); - ASSERT (shift > 0); - } - while (vp[0] == 0) - { - vp++; - vsize--; - } - - if (EVEN_P (vp[0])) - { - unsigned vcount; - - count_trailing_zeros (vcount, vp[0]); - ASSERT (vcount > 0); - ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount)); - if (vp[vsize - 1] == 0) - vsize--; - } - } - else if (EVEN_P (vp[0])) - { - unsigned vcount; - count_trailing_zeros (vcount, vp[0]); - ASSERT (vcount > 0); - ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount)); - if (vp[vsize - 1] == 0) - vsize--; - - if (EVEN_P (up[0])) - { - unsigned ucount; - count_trailing_zeros (ucount, up[0]); - ASSERT (ucount > 0); - shift = MIN (ucount, vcount); - } - } + TMP_MARK; - gsize = gcd_binary_odd (gp, up, usize, vp, vsize); - if (shift) + ap = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + bp = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + up = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + vp = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + gp = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + tp = TMP_ALLOC_LIMBS (MPN_GCD_LEHMER_N_ITCH (P_TABLE_SIZE)); + + mpn_random (ap, P_TABLE_SIZE); + mpn_random (bp, P_TABLE_SIZE); + + memset (p_table, 0, sizeof(p_table)); + + for (n = 10; n++; n < P_TABLE_SIZE) { - mp_limb_t cy = mpn_lshift (gp, gp, gsize, shift); - if (cy) - gp[gsize++] = cy; - } - return gsize + zero_words; -} - -#define MPN_LEQ_P(ap, asize, bp, bsize) \ -((asize) < (bsize) || ((asize) == (bsize) \ - && mpn_cmp ((ap), (bp), (asize)) <= 0)) - -/* Sets (a, b, c, d) <-- (c, d, a, 
b) */ -#define NHGCD_SWAP4_2(row) \ -do { \ - struct hgcd_row __nhgcd_swap4_2_tmp; \ - __nhgcd_swap4_2_tmp = row[0]; \ - row[0] = row[2]; \ - row[2] = __nhgcd_swap4_2_tmp; \ - __nhgcd_swap4_2_tmp = row[1]; \ - row[1] = row[3]; \ - row[3] = __nhgcd_swap4_2_tmp; \ -} while (0) - -/* Sets (a, b, c) <-- (b, c, a) */ -#define NHGCD_SWAP3_LEFT(row) \ -do { \ - struct hgcd_row __nhgcd_swap4_left_tmp; \ - __nhgcd_swap4_left_tmp = row[0]; \ - row[0] = row[1]; \ - row[1] = row[2]; \ - row[2] = __nhgcd_swap4_left_tmp; \ -} while (0) - -static mp_size_t -hgcd_tdiv (mp_ptr qp, - mp_ptr rp, mp_size_t *rsizep, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize) -{ - mp_size_t qsize; - mp_size_t rsize; + mp_size_t p; + mp_size_t best_p; + double best_time; + double lehmer_time; - mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize); + if (ap[n-1] == 0) + ap[n-1] = 1; - rsize = bsize; - MPN_NORMALIZE (rp, rsize); - *rsizep = rsize; + if (bp[n-1] == 0) + bp[n-1] = 1; - qsize = asize - bsize + 1; - qsize -= (qp[qsize - 1] == 0); + p_table[n] = 0; + TIME(lehmer_time, { + MPN_COPY (up, ap, n); + MPN_COPY (vp, bp, n); + mpn_gcd_lehmer_n (gp, up, vp, n, tp); + }); - if (qsize == 1 && qp[0] == 1) - return 0; - - return qsize; -} - - -#if 0 -#define GCD_LEHMER_ITCH(asize) (5*((asize) + 1)) - -static mp_size_t -gcd_lehmer (mp_ptr gp, mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_ptr tp, mp_size_t talloc) -{ - struct hgcd_row r[4]; - mp_ptr qp; - mp_size_t qsize; - mp_size_t ralloc = asize + 1; + best_time = lehmer_time; + best_p = 0; - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - -#if 0 - if (BELOW_THRESHOLD (asize, MPN_GCD_LEHMER_THRESHOLD)) - { - ASSERT (asize + bsize + 2 <= talloc); - - MPN_COPY (tp, ap, asize); - MPN_COPY (tp + asize + 1, bp, bsize); - return nhgcd_gcd_binary (gp, tp, asize, tp + asize + 1, bsize); - } -#endif - - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); - ASSERT (5 * asize + 4 <= talloc); - - r[0].rp = tp; tp += ralloc; talloc -= 
ralloc; - r[1].rp = tp; tp += ralloc; talloc -= ralloc; - r[2].rp = tp; tp += ralloc; talloc -= ralloc; - r[3].rp = tp; tp += ralloc; talloc -= ralloc; - qp = tp; tp += asize; talloc -= asize; - - MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; - MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; - -#if 0 - /* u and v fields aren't used, but zero them out so that we can call - trace_nhgcd_row */ - r[0].uvp[0] = r[0].uvp[1] = NULL; - r[1].uvp[0] = r[1].uvp[1] = NULL; - r[2].uvp[0] = r[2].uvp[1] = NULL; - r[3].uvp[0] = r[3].uvp[1] = NULL; -#endif - - while (ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD) && r[1].rsize > 0) - { - struct hgcd2 hgcd; - int res = mpn_hgcd2_lehmer_step (&hgcd, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize, - NULL); - - if (!res || (res == 2 && hgcd.row[0].v == 0)) + for (p = 1; p < n; p += (n+9)/10) { - qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); - NHGCD_SWAP3_LEFT (r); - } - else - { - const struct hgcd2_row *s = hgcd.row + (res - 2); - int sign = hgcd.sign; - if (res == 3) - sign = ~sign; - - /* s[0] and s[1] correct. 
*/ - r[2].rsize - = mpn_hgcd2_fix (r[2].rp, ralloc, - sign, - s[0].u, r[0].rp, r[0].rsize, - s[0].v, r[1].rp, r[1].rsize); - - r[3].rsize - = mpn_hgcd2_fix (r[3].rp, ralloc, - ~sign, - s[1].u, r[0].rp, r[0].rsize, - s[1].v, r[1].rp, r[1].rsize); - - NHGCD_SWAP4_2 (r); - } - } - - if (r[1].rsize == 0) - { - MPN_COPY (gp, r[0].rp, r[0].rsize); - return r[0].rsize; - } + double t; - return gcd_binary (gp, r[0].rp, r[0].rsize, r[1].rp, r[1].rsize); -} -#endif - -static mp_size_t -gcd_schoenhage_itch (mp_size_t asize) -{ - /* Size for hgcd calls */ - mp_size_t ralloc = asize + 1; - mp_size_t hgcd_size = (asize + 1) / 2; - return (4 * ralloc /* Remainder storage */ - + mpn_hgcd_init_itch (hgcd_size) /* hgcd storage */ - + qstack_itch (hgcd_size) - + mpn_hgcd_itch (hgcd_size) /* nhgcd call */ - + 1+ 3 * asize / 4); /* hgcd_fix */ -} + p_table[n] = p; + TIME(t, { + MPN_COPY (up, ap, n); + MPN_COPY (vp, bp, n); + mpn_gcd (gp, up, n, vp, n); + }); -static mp_size_t -gcd_schoenhage (mp_ptr gp, mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_ptr tp, mp_size_t talloc) -{ - mp_size_t scratch; - struct hgcd hgcd; - struct qstack quotients; - struct hgcd_row r[4]; - - mp_size_t ralloc = asize + 1; - - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); - - ASSERT (4 * ralloc <= talloc); - tp += ralloc; talloc -= ralloc; - r[0].rp = tp; tp += ralloc; talloc -= ralloc; - r[1].rp = tp; tp += ralloc; talloc -= ralloc; - r[2].rp = tp; tp += ralloc; talloc -= ralloc; - r[3].rp = tp; tp += ralloc; talloc -= ralloc; - - MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; - MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; - -#if 0 - /* We don't use the u and v fields, but zero them out so that we can - call trace_nhgcd_row while debugging. 
*/ - r[0].uvp[0] = r[0].uvp[1] = NULL; - r[1].uvp[0] = r[1].uvp[1] = NULL; - r[2].uvp[0] = r[2].uvp[1] = NULL; - r[3].uvp[0] = r[3].uvp[1] = NULL; -#endif - - scratch = mpn_hgcd_init_itch ((asize + 1)/2); - ASSERT (scratch <= talloc); - mpn_hgcd_init (&hgcd, (asize + 1)/2, tp); - tp += scratch; talloc -= scratch; - - { - mp_size_t nlimbs = qstack_itch ((asize + 1)/2); - - ASSERT (nlimbs <= talloc); - - qstack_init ("ients, (asize + 1) / 2, tp, nlimbs); - - tp += nlimbs; - talloc -= nlimbs; - } - - while (ABOVE_THRESHOLD (r[0].rsize, GCD_SCHOENHAGE_THRESHOLD) - && r[1].rsize > 0) - { - mp_size_t k = r[0].rsize / 2; - int res; - -#if 0 - trace ("nhgcd_gcd_schoenhage\n"); - trace_nhgcd_row (r); - trace_nhgcd_row (r + 1); -#endif - if (r[1].rsize <= k) - goto euclid; - - qstack_reset ("ients, r[0].rsize - k); - - res = mpn_hgcd (&hgcd, - r[0].rp + k, r[0].rsize - k, - r[1].rp + k, r[1].rsize - k, - "ients, - tp, talloc); - - if (res == 0 || res == 1) - { - euclid: - ASSERT (r[0].rsize - r[1].rsize + 1 <= talloc); - hgcd_tdiv (tp, r[2].rp, &r[2].rsize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); - - NHGCD_SWAP3_LEFT (r); - } - else - { - const struct hgcd_row *s = hgcd.row + (res - 2); - int sign = hgcd.sign; - if (res == 3) - sign = ~sign; - - /* s[0] and s[1] are correct */ - r[2].rsize - = mpn_hgcd_fix (k, r[2].rp, ralloc, - sign, hgcd.size, s, - r[0].rp, r[1].rp, - tp, talloc); - - r[3].rsize - = mpn_hgcd_fix (k, r[3].rp, ralloc, - ~sign, hgcd.size, s+1, - r[0].rp, r[1].rp, - tp, talloc); - - NHGCD_SWAP4_2 (r); + if (t < best_time) + { + best_time = t; + best_p = p; + } } - } + printf("%6d %6d %5.3g", n, best_p, (double) best_p / n); + if (best_p > 0) + printf(" %5.3g%%", 100 * (lehmer_time - best_time) / lehmer_time); + printf("\n"); -#if 0 - trace ("nhgcd_gcd_schoenhage after loop\n"); - trace_nhgcd_row (r); - trace_nhgcd_row (r + 1); -#endif - - if (r[1].rsize == 0) - { - MPN_COPY (gp, r[0].rp, r[0].rsize); - return r[0].rsize; - } -#if 0 - else if 
(ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD)) - return gcd_lehmer (gp, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize, - tp, talloc); -#endif - else - return gcd_binary (gp, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); -} - -/* Should we perform an initial division? */ -mp_size_t -mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) -{ - if (BELOW_THRESHOLD (usize, GCD_SCHOENHAGE_THRESHOLD)) - return gcd_binary_odd (gp, up, usize, vp, vsize); - - /* The algorithms below require U >= V, while mpn_gcd is long documented as - requiring only that the position of U's msb >= V's msb. */ - if (usize == vsize && mpn_cmp (up, vp, usize) < 0) - MP_PTR_SWAP (up, vp); - -#if 0 - if (BELOW_THRESHOLD (usize, GCD_SCHOENHAGE_THRESHOLD)) - { - mp_size_t scratch; - mp_ptr tp; - mp_size_t gsize; - TMP_DECL; - - TMP_MARK; - - scratch = GCD_LEHMER_ITCH (usize); - tp = TMP_ALLOC_LIMBS (scratch); - - gsize = gcd_lehmer (gp, up, usize, vp, vsize, tp, scratch); - TMP_FREE; - return gsize; - } - else -#endif - { - mp_size_t scratch; - mp_ptr tp; - mp_size_t gsize; - - scratch = gcd_schoenhage_itch (usize); - tp = __GMP_ALLOCATE_FUNC_LIMBS (scratch); - - gsize = gcd_schoenhage (gp, up, usize, vp, vsize, tp, scratch); - __GMP_FREE_FUNC_LIMBS (tp, scratch); - return gsize; + p_table[n] = best_p; } + TMP_FREE; + gmp_randclear(rands); + return 0; } +#endif /* TUNE_GCD_P */ diff --git a/mpn/generic/gcd_lehmer.c b/mpn/generic/gcd_lehmer.c new file mode 100644 index 000000000..42a7ddefc --- /dev/null +++ b/mpn/generic/gcd_lehmer.c @@ -0,0 +1,161 @@ +/* gcd_lehmer.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Use binary algorithm to compute G <-- GCD (U, V) for usize, vsize == 2. + Both U and V must be odd. */ +static inline mp_size_t +gcd_2 (mp_ptr gp, mp_srcptr up, mp_srcptr vp) +{ + mp_limb_t u0, u1, v0, v1; + mp_size_t gn; + + u0 = up[0]; + u1 = up[1]; + v0 = vp[0]; + v1 = vp[1]; + + ASSERT (u0 & 1); + ASSERT (v0 & 1); + + /* Check for u0 != v0 needed to ensure that argument to + * count_trailing_zeros is non-zero. */ + while (u1 != v1 && u0 != v0) + { + unsigned long int r; + if (u1 > v1) + { + u1 -= v1 + (u0 < v0); + u0 = (u0 - v0) & GMP_NUMB_MASK; + count_trailing_zeros (r, u0); + u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r); + u1 >>= r; + } + else /* u1 < v1. */ + { + v1 -= u1 + (v0 < u0); + v0 = (v0 - u0) & GMP_NUMB_MASK; + count_trailing_zeros (r, v0); + v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r); + v1 >>= r; + } + } + + gp[0] = u0, gp[1] = u1, gn = 1 + (u1 != 0); + + /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */ + if (u1 == v1 && u0 == v0) + return gn; + + v0 = (u0 == v0) ? ((u1 > v1) ? u1-v1 : v1-u1) : ((u0 > v0) ? 
u0-v0 : v0-u0); + gp[0] = mpn_gcd_1 (gp, gn, v0); + + return 1; +} + +/* Temporary storage: n */ +mp_size_t +mpn_gcd_lehmer_n (mp_ptr gp, mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp) +{ + mp_size_t scratch; + + /* Relax this requirement, and normalize at the start? Must disallow + A = B = 0, though. */ + ASSERT(ap[n-1] > 0 || bp[n-1] > 0); + + while (n > 2) + { + struct hgcd_matrix1 M; + mp_limb_t ah, al, bh, bl; + mp_limb_t mask; + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_nhgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M)) + /* Temporary storage n */ + n = mpn_hgcd_mul_matrix1_inverse_vector (&M, n, ap, bp, tp); + + else + { + /* mpn_hgcd2 has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + mp_size_t gn; + + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (gp, &gn, ap, bp, n, tp); + if (n == 0) + return gn; + } + } + + if (n == 1) + { + *gp = mpn_gcd_1(ap, 1, bp[0]); + return 1; + } + + /* Due to the calling convention for mpn_gcd, at most one can be + even. */ + + if (! (ap[0] & 1)) + MP_PTR_SWAP (ap, bp); + + ASSERT (ap[0] & 1); + + if (bp[0] == 0) + { + *gp = mpn_gcd_1 (ap, 2, bp[1]); + return 1; + } + else if (! 
(bp[0] & 1)) + { + int r; + count_trailing_zeros (r, bp[0]); + bp[0] = ((bp[1] << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (bp[0] >> r); + bp[1] >>= r; + } + + return gcd_2(gp, ap, bp); +} diff --git a/mpn/generic/gcd_subdiv_step.c b/mpn/generic/gcd_subdiv_step.c new file mode 100644 index 000000000..d9708e8e1 --- /dev/null +++ b/mpn/generic/gcd_subdiv_step.c @@ -0,0 +1,116 @@ +/* gcd_subdiv_step.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +static inline int +mpn_zero_p (mp_srcptr ap, mp_size_t n) +{ + mp_size_t i; + for (i = n - 1; i >= 0; i--) + { + if (ap[i] != 0) + return 0; + } + return 1; +} + +/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or + b is small, or the difference is small. Perform one subtraction + followed by one division. If the gcd is found, stores it in gp and + *gn, and returns zero. Otherwise, compute the reduced a and b, and + return the new size. 
*/ + +/* FIXME: Check when the smaller number is a single limb, and invoke + * mpn_gcd_1. */ +mp_size_t +mpn_gcd_subdiv_step (mp_ptr gp, mp_size_t *gn, + mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp) +{ + mp_size_t an, bn; + + ASSERT (n > 0); + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); + + an = bn = n; + MPN_NORMALIZE (ap, an); + MPN_NORMALIZE (bp, bn); + + if (UNLIKELY (an == 0)) + { + return_b: + MPN_COPY (gp, bp, bn); + *gn = bn; + return 0; + } + else if (UNLIKELY (bn == 0)) + { + return_a: + MPN_COPY (gp, ap, an); + *gn = an; + return 0; + } + + /* Arrange so that a > b, subtract an -= bn, and maintain + normalization. */ + if (an < bn) + MPN_PTR_SWAP (ap, an, bp, bn); + else if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + goto return_a; + else if (c < 0) + MP_PTR_SWAP (ap, bp); + } + + ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn)); + MPN_NORMALIZE (ap, an); + ASSERT (an > 0); + + /* Arrange so that a > b, and divide a = q b + r */ + /* FIXME: an < bn happens when we have cancellation. If that is the + common case, then we could reverse the roles of a and b to avoid + the swap. */ + if (an < bn) + MPN_PTR_SWAP (ap, an, bp, bn); + else if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + goto return_a; + else if (c < 0) + MP_PTR_SWAP (ap, bp); + } + + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn); + + if (mpn_zero_p (ap, bn)) + goto return_b; + + return bn; +} diff --git a/mpn/generic/gcdext.c b/mpn/generic/gcdext.c index 63528f98e..94d490791 100644 --- a/mpn/generic/gcdext.c +++ b/mpn/generic/gcdext.c @@ -18,819 +18,101 @@ License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ -#define WANT_TRACE 0 - -/* Default to binary gcdext_1, since it is best on most current machines. - We should teach tuneup to choose the right gcdext_1. 
*/ -#define GCDEXT_1_USE_BINARY 1 - -#if WANT_TRACE -# include <stdio.h> -# include <stdarg.h> -#endif - #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" -#ifndef NULL -# define NULL ((void *) 0) -#endif - -#if WANT_TRACE -static void -trace (const char *format, ...) -{ - va_list args; - va_start (args, format); - gmp_vfprintf (stderr, format, args); - va_end (args); -} -#endif - -/* Comparison of _normalized_ numbers. */ - -#define MPN_EQUAL_P(ap, asize, bp, bsize) \ -((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0) - -#define MPN_LEQ_P(ap, asize, bp, bsize) \ -((asize) < (bsize) || ((asize) == (bsize) \ - && mpn_cmp ((ap), (bp), (asize)) <= 0)) - -/* Returns g, u and v such that g = u A - v B. There are three - different cases for the result: - - g = u A - v B, 0 < u < b, 0 < v < a - g = A u = 1, v = 0 - g = B u = B, v = A - 1 - - We always return with 0 < u <= b, 0 <= v < a. -*/ -#if GCDEXT_1_USE_BINARY - -static mp_limb_t -gcdext_1_odd (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) +static inline int +mpn_zero_p (mp_srcptr ap, mp_size_t n) { - mp_limb_t u0; - mp_limb_t v0; - mp_limb_t v1; - mp_limb_t u1; - - mp_limb_t B = b; - mp_limb_t A = a; - - /* Through out this function maintain - - a = u0 A - v0 B - b = u1 A - v1 B - - where A and B are odd. 
*/ - - u0 = 1; v0 = 0; - u1 = b; v1 = a-1; - - if (A == 1) - { - *up = u0; *vp = v0; - return 1; - } - else if (B == 1) - { - *up = u1; *vp = v1; - return 1; - } - - while (a != b) - { - mp_limb_t mask; - - ASSERT (a % 2 == 1); - ASSERT (b % 2 == 1); - - ASSERT (0 < u0); ASSERT (u0 <= B); - ASSERT (0 < u1); ASSERT (u1 <= B); - - ASSERT (0 <= v0); ASSERT (v0 < A); - ASSERT (0 <= v1); ASSERT (v1 < A); - - if (a > b) - { - MP_LIMB_T_SWAP (a, b); - MP_LIMB_T_SWAP (u0, u1); - MP_LIMB_T_SWAP (v0, v1); - } - - ASSERT (a < b); - - /* Makes b even */ - b -= a; - - mask = - (mp_limb_t) (u1 < u0); - u1 += B & mask; - v1 += A & mask; - u1 -= u0; - v1 -= v0; - - ASSERT (b % 2 == 0); - - do - { - /* As b = u1 A + v1 B is even, while A and B are odd, - either both or none of u1, v1 is even */ - - ASSERT (u1 % 2 == v1 % 2); - - mask = -(u1 & 1); - u1 = u1 / 2 + ((B / 2) & mask) - mask; - v1 = v1 / 2 + ((A / 2) & mask) - mask; - - b /= 2; - } - while (b % 2 == 0); - } - - /* Now g = a = b */ - ASSERT (a == b); - ASSERT (u1 <= B); - ASSERT (v1 < A); - - ASSERT (A % a == 0); - ASSERT (B % a == 0); - ASSERT (u0 % (B/a) == u1 % (B/a)); - ASSERT (v0 % (A/a) == v1 % (A/a)); - - *up = u0; *vp = v0; - - return a; -} - -static mp_limb_t -gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) -{ - unsigned shift = 0; - mp_limb_t g; - mp_limb_t u; - mp_limb_t v; - - /* We use unsigned values in the range 0, ... B - 1. As the values - are uniquely determined only modulo B, we can add B at will, to - get numbers in range or flip the least significant bit. */ - /* Deal with powers of two */ - while ((a | b) % 2 == 0) - { - a /= 2; b /= 2; shift++; - } - - if (b % 2 == 0) - { - unsigned k = 0; - - do { - b /= 2; k++; - } while (b % 2 == 0); - - g = gcdext_1_odd (&u, &v, a, b); - - while (k--) - { - /* We have g = u a + v b, and need to construct - g = u'a + v'(2b). 
- - If v is even, we can just set u' = u, v' = v/2 - If v is odd, we can set v' = (v + a)/2, u' = u + b - */ - - if (v % 2 == 0) - v /= 2; - else - { - u = u + b; - v = v/2 + a/2 + 1; - } - b *= 2; - } - } - else if (a % 2 == 0) - { - unsigned k = 0; - - do { - a /= 2; k++; - } while (a % 2 == 0); - - g = gcdext_1_odd (&u, &v, a, b); - - while (k--) - { - /* We have g = u a + v b, and need to construct - g = u'(2a) + v'b. - - If u is even, we can just set u' = u/2, v' = v. - If u is odd, we can set u' = (u + b)/2 - */ - - if (u % 2 == 0) - u /= 2; - else - { - u = u/2 + b/2 + 1; - v = v + a; - } - a *= 2; - } - } - else - /* Ok, both are odd */ - g = gcdext_1_odd (&u, &v, a, b); - - *up = u; - *vp = v; - - return g << shift; -} - -#else /* ! GCDEXT_1_USE_BINARY */ -static mp_limb_t -gcdext_1_u (mp_limb_t *up, mp_limb_t a, mp_limb_t b) -{ - /* Maintain - - a = u0 A mod B - b = - u1 A mod B - */ - mp_limb_t u0 = 1; - mp_limb_t u1 = 0; - mp_limb_t B = b; - - ASSERT (a >= b); - ASSERT (b > 0); - - for (;;) + mp_size_t i; + for (i = n - 1; i >= 0; i--) { - mp_limb_t q; - - q = a / b; - a -= q * b; - - if (a == 0) - { - *up = B - u1; - return b; - } - u0 += q * u1; - - q = b / a; - b -= q * a; - - if (b == 0) - { - *up = u0; - return a; - } - u1 += q * u0; + if (ap[i] != 0) + return 0; } + return 1; } -static mp_limb_t -gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) -{ - /* Maintain - - a = u0 A - v0 B - b = - u1 A + v1 B = (B - u1) A - (A - v1) B - */ - mp_limb_t u0 = 1; - mp_limb_t v0 = 0; - mp_limb_t u1 = 0; - mp_limb_t v1 = 1; - - mp_limb_t A = a; - mp_limb_t B = b; - - ASSERT (a >= b); - ASSERT (b > 0); - - for (;;) - { - mp_limb_t q; - - q = a / b; - a -= q * b; - - if (a == 0) - { - *up = B - u1; - *vp = A - v1; - return b; - } - u0 += q * u1; - v0 += q * v1; +/* Computes r = u0 x0 + u1 x1. Needs n = un + xn limbs of temporary + storage. 
Result is of size n-1, n or n+1, and the size is returned + (if inputs are non-normalized, result may be non-normalized too). - q = b / a; - b -= q * a; + No overlap between input and output is allowed, since rp is used + for temporary storage. */ - if (b == 0) - { - *up = u0; - *vp = v0; - return a; - } - u1 += q * u0; - v1 += q * v0; - } -} -#endif /* ! GCDEXT_1_USE_BINARY */ - -/* FIXME: Duplicated in gcd.c */ static mp_size_t -hgcd_tdiv (mp_ptr qp, - mp_ptr rp, mp_size_t *rsizep, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize) +addmul2_n (mp_ptr rp, + mp_srcptr u0, mp_srcptr u1, mp_size_t un, + mp_srcptr x0, mp_srcptr x1, mp_size_t xn, + mp_ptr tp) { - mp_size_t qsize; - mp_size_t rsize; - - mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize); - - rsize = bsize; - MPN_NORMALIZE (rp, rsize); - *rsizep = rsize; - - qsize = asize - bsize + 1; - qsize -= (qp[qsize - 1] == 0); - - if (qsize == 1 && qp[0] == 1) - return 0; - - return qsize; -} - -/* FIXME: Duplicated in hgcd.c */ -static mp_limb_t -mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n, - mp_ptr ap, mp_limb_t u, - mp_ptr bp, mp_limb_t v) -{ - mp_limb_t h; mp_limb_t cy; + mp_size_t n; - h = mpn_mul_1 (rp, ap, n, u); - cy = mpn_addmul_1 (rp, bp, n, v); - h += cy; -#if GMP_NAIL_BITS == 0 - rp[n] = h; - return (h < cy); -#else /* GMP_NAIL_BITS > 0 */ - rp[n] = h & GMP_NUMB_MASK; - return h >> GMP_NUMB_BITS; -#endif /* GMP_NAIL_BITS > 0 */ -} - - -/* Computes u2 = u0 + q u1 - - Returns new size. - - FIXME: Notation in the function not quite consistent - FIXME: Severe code duplication with hgcd_update_uv */ - -static mp_size_t -hgcd_update_u (struct hgcd_row *r, mp_size_t usize, - mp_srcptr qp, mp_size_t qsize, - /* Limbs allocated for the new u, for sanity - checking */ - mp_size_t alloc) -{ - mp_srcptr u0p = r[0].uvp[0]; - mp_srcptr u1p = r[1].uvp[0]; - mp_ptr u2p = r[2].uvp[0]; - - ASSERT (usize < alloc); - - /* u1 = 0 is an exceptional case. Except for this, u1 should be - normalized. 
*/ - - ASSERT ((usize == 1 && u1p[0] == 0) || u1p[usize - 1] != 0); - - /* Compute u2 = u0 + q u1 */ - - if (usize == 1 && u1p[0] == 0) - { - /* u1 == 0 is a special case, then q might be large, but it - doesn't matter. Can happen only when u0 = v1 = 1, u1 = v0 = - 0, and hence usize == 1. */ - MPN_COPY (u2p, u0p, usize); - } - else if (qsize == 0) - /* Represents a unit quotient */ - { - mp_limb_t cy = mpn_add_n (u2p, u0p, u1p, usize); - u2p[usize] = cy; - usize += (cy != 0); - } - else if (qsize == 1) - { - mp_limb_t cy; - - cy = mpn_mul_1 (u2p, u1p, usize, qp[0]); - cy += mpn_add_n (u2p, u2p, u0p, usize); - - u2p[usize] = cy; - usize += (cy != 0); - } - else - { - if (qsize <= usize) - mpn_mul (u2p, u1p, usize, qp, qsize); - else - mpn_mul (u2p, qp, qsize, u1p, usize); - - ASSERT_NOCARRY (mpn_add (u2p, - u2p, usize + qsize, - u0p, usize)); - - usize += qsize; - usize -= (u2p[usize - 1] == 0); - } - ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0); - ASSERT (r[2].uvp[0][usize - 1] != 0); - - return usize; -} - - -/* Computes Y = R * X. No overlap allowed. 
*/ -static mp_size_t -hgcd2_mul_vector (struct hgcd_row *Y, - mp_size_t alloc, - const struct hgcd2_row *R, - const struct hgcd_row *X, mp_size_t n) -{ - unsigned i; - int grow = 0; - mp_limb_t h = 0; - - ASSERT (n < alloc); - - for (i = 0; i < 2; i++) - { - /* Set Y[i] = R[i, 0] X[0] + R[i,1] X[1] - = u X[0] + v X[0] */ - mp_limb_t cy; - - cy = mpn_addmul2_n_1 (Y[i].uvp[0], n, - X[0].uvp[0], R[i].u, - X[1].uvp[0], R[i].v); - - if (cy) - { - ASSERT (n + 2 <= alloc); - Y[i].uvp[0][n+1] = cy; - grow = 1; - } - else - h |= Y[i].uvp[0][n]; - } - if (grow) - return n + 2; - else - /* Don't add redundant zeroes */ - return n + (h != 0); -} - -/* Sets (a, b, c) <-- (b, c, a) */ -#define HGCD_SWAP3_LEFT(row) \ -do { \ - struct hgcd_row __hgcd_swap4_left_tmp = row[0]; \ - row[0] = row[1]; \ - row[1] = row[2]; \ - row[2] = __hgcd_swap4_left_tmp; \ -} while (0) - -/* Sets (a, b, c, d) <-- (c, d, a, b) */ -#define HGCD_SWAP4_2(row) \ -do { \ - struct hgcd_row __hgcd_swap4_2_tmp = row[0]; \ - row[0] = row[2]; \ - row[2] = __hgcd_swap4_2_tmp; \ - __hgcd_swap4_2_tmp = row[1]; \ - row[1] = row[3]; \ - row[3] = __hgcd_swap4_2_tmp; \ -} while (0) - -static mp_size_t -gcdext_lehmer_itch (mp_size_t asize, mp_size_t bsize) -{ - mp_size_t ralloc = asize + 1; - mp_size_t ualloc = bsize + 1; - - return 4 * ralloc + 4 * ualloc + asize; -} - -static mp_size_t -gcdext_lehmer (mp_ptr gp, mp_ptr up, mp_size_t *usize, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_ptr tp, mp_size_t talloc) -{ - struct hgcd_row r[4]; - /* Size and sign of u fields. The largest u should be normalized to - this size, and except for the case u1 = 0, that is the latest - u. 
*/ - int rsize; - int rsign; - - mp_ptr qp; - mp_size_t qsize; - mp_size_t ralloc = asize + 1; - mp_size_t ualloc = bsize + 1; - - struct hgcd2 hgcd; - int res; - - ASSERT (asize >= bsize); - ASSERT (asize > 1); - ASSERT (bsize > 0); - - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); - - ASSERT (4 * ralloc + 4*ualloc + asize <= talloc); - - r[0].rp = tp; tp += ralloc; talloc -= ralloc; - r[1].rp = tp; tp += ralloc; talloc -= ralloc; - r[2].rp = tp; tp += ralloc; talloc -= ralloc; - r[3].rp = tp; tp += ralloc; talloc -= ralloc; - - /* Must zero out the u fields. We don't use the v fields. */ - MPN_ZERO (tp, 4 * ualloc); - - r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - - qp = tp; tp += asize; talloc -= asize; - - res = mpn_hgcd2_lehmer_step (&hgcd, - ap, asize, - bp, bsize, - NULL); - - if (res == 0 || (res == 2 && hgcd.row[0].v == 0)) + if (xn >= un) { - qsize = hgcd_tdiv (qp, r[1].rp, &r[1].rsize, - ap, asize, - bp, bsize); - MPN_COPY (r[0].rp, bp, bsize); - r[0].rsize = bsize; - - r[0].uvp[0][0] = 0; - r[1].uvp[0][0] = 1; - rsign = -1; + mpn_mul (rp, x0, xn, u0, un); + mpn_mul (tp, x1, xn, u1, un); } else { - const struct hgcd2_row *s = hgcd.row + (res - 2); - rsign = hgcd.sign; - if (res == 3) - rsign = ~rsign; - - /* s[0] and s[1] correct. 
*/ - r[0].rsize - = mpn_hgcd2_fix (r[0].rp, ralloc, - rsign, - s[0].u, ap, asize, - s[0].v, bp, bsize); - - r[1].rsize - = mpn_hgcd2_fix (r[1].rp, ralloc, - ~rsign, - s[1].u, ap, asize, - s[1].v, bp, bsize); - - r[0].uvp[0][0] = s[0].u; - r[1].uvp[0][0] = s[1].u; - } - rsize = 1; - - while (r[0].rsize >= 2 && r[1].rsize > 0) - { - res = mpn_hgcd2_lehmer_step (&hgcd, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize, - NULL); - - if (res == 0 || (res == 2 && hgcd.row[0].v == 0)) - { - qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); - rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc); - HGCD_SWAP3_LEFT (r); - rsign = ~rsign; - } - else - { - const struct hgcd2_row *s = hgcd.row + (res - 2); - int sign = hgcd.sign; - if (res == 3) - sign = ~sign; - - /* s[0] and s[1] correct. */ - r[2].rsize - = mpn_hgcd2_fix (r[2].rp, ralloc, - sign, - s[0].u, r[0].rp, r[0].rsize, - s[0].v, r[1].rp, r[1].rsize); - - r[3].rsize - = mpn_hgcd2_fix (r[3].rp, ralloc, - ~sign, - s[1].u, r[0].rp, r[0].rsize, - s[1].v, r[1].rp, r[1].rsize); - - rsize = hgcd2_mul_vector (r + 2, ralloc, s, r, rsize); - rsign ^= sign; - HGCD_SWAP4_2 (r); - } + mpn_mul (rp, u0, un, x0, xn); + mpn_mul (tp, u1, un, x1, xn); } - if (r[1].rsize == 0) - { - MPN_NORMALIZE (r[0].uvp[0], rsize); - MPN_COPY (gp, r[0].rp, r[0].rsize); - MPN_COPY (up, r[0].uvp[0], rsize); + n = un + xn; + cy = mpn_add_n (rp, rp, tp, n); - *usize = (rsign >= 0) ? rsize : -rsize; - return r[0].rsize; - } + if (cy > 0) + rp[n++] = cy; else - { - mp_limb_t cy; - mp_limb_t u; - mp_limb_t v; - - gp[0] = gcdext_1 (&u, &v, r[0].rp[0], r[1].rp[0]); - cy = mpn_addmul2_n_1 (up, rsize, - r[0].uvp[0], u, - r[1].uvp[0], v); - rsize++; - if (cy) - up[rsize++] = cy; - else - MPN_NORMALIZE (up, rsize); + MPN_NORMALIZE (rp, n); - *usize = (rsign >= 0) ? rsize : -rsize; - return 1; - } + return n; } -/* Computes Y = R * X. No overlap allowed. 
- - Temporary space is needed for two numbers smaller than the - resulting matrix elements, i.e. bounded by 2*L <= N. - - FIXME: Severe code duplication with hgcd.c: hgcd_mul. */ +#define COMPUTE_V_ITCH(n) (2*(n) + 1) +/* Computes |v| = |(g - u a)| / b, where u may be positive or + negative, and v is of the opposite sign. a, b are of size n, u and + v at most size n, and v must have space for n+1 limbs. */ static mp_size_t -hgcd_mul_vector (struct hgcd_row *Y, mp_size_t alloc, - const struct hgcd_row *R, mp_size_t rsize, - const struct hgcd_row *X, mp_size_t xsize, - mp_ptr tp, mp_size_t talloc) -{ - unsigned i; - - mp_size_t ysize; - mp_limb_t h; - int grow; - - MPN_NORMALIZE (R[1].uvp[1], rsize); - /* u1 = 0 is an exceptional case. Except for this, u1 should be - normalized. */ - ASSERT ((xsize == 1 && X[1].uvp[0][0] == 0) - || X[1].uvp[0][xsize - 1] != 0); - - if (xsize == 1 && X[1].uvp[0][0] == 0) - { - /* Special case. Set Y[i, 0] = R[i, 0] */ - ASSERT (X[0].uvp[0][0] == 1); - - if (rsize > 1) - MPN_NORMALIZE (R[1].uvp[0], rsize); - MPN_COPY (Y[0].uvp[0], R[0].uvp[0], rsize); - MPN_COPY (Y[1].uvp[0], R[1].uvp[0], rsize); - - return rsize; - } - - ysize = rsize + xsize; - ASSERT (ysize <= talloc); - - h = 0; grow = 0; - - if (rsize >= xsize) - { - for (i = 0; i < 2; i++) - { - /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */ - mp_limb_t cy; - - mpn_mul (Y[i].uvp[0], R[i].uvp[0], rsize, X[0].uvp[0], xsize); - mpn_mul (tp, R[i].uvp[1], rsize, X[1].uvp[0], xsize); - - cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize); - - if (cy) - { - ASSERT (ysize + 1 < alloc); - Y[i].uvp[0][ysize] = cy; - grow = 1; - } - else - h |= Y[i].uvp[0][ysize - 1]; - } - } - else - { - for (i = 0; i < 2; i++) - { - /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */ - mp_limb_t cy; - - mpn_mul (Y[i].uvp[0], X[0].uvp[0], xsize, R[i].uvp[0], rsize); - mpn_mul (tp, X[1].uvp[0], xsize, R[i].uvp[1], rsize); - - cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize); - - if (cy) - { - 
ASSERT (ysize + 1 < alloc); - Y[i].uvp[0][ysize] = cy; - grow = 1; - } - else - h |= Y[i].uvp[0][ysize - 1]; - } - } - - if (grow) - ysize++; - else - ysize -= (h == 0); - - ASSERT ((ysize == 1 && Y[1].uvp[0][0] == 0) || Y[1].uvp[0][ysize - 1] != 0); - - return ysize; -} - -#define COMPUTE_V_ITCH(asize, bsize, usize) \ - ((usize) + (asize) + 1 + (bsize)) - -/* Computes |v| = |(c - u a)| / b, where u may be positive or negative, - and v is of the opposite sign. Requires that b, c, |u| <= a. */ -static mp_size_t -compute_v (mp_ptr vp, mp_size_t valloc, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_srcptr cp, mp_size_t csize, +compute_v (mp_ptr vp, + mp_srcptr ap, mp_srcptr bp, mp_size_t n, + mp_srcptr gp, mp_size_t gn, mp_srcptr up, mp_size_t usize, - mp_ptr tp, mp_size_t talloc) + mp_ptr tp) { mp_size_t size; - mp_size_t vsize; - mp_ptr rp; - - ASSERT (asize); - ASSERT (bsize); - ASSERT (csize); - ASSERT (asize >= bsize); - -#if 0 - trace ("compute_v: a = %Nd\n" - " b = %Nd\n" - " c = %Nd\n" - " u = %Nd\n", - ap, asize, bp, bsize, cp, csize, up, usize); -#endif - - ASSERT (usize); - + mp_size_t an; + mp_size_t bn; + mp_size_t vn; + + ASSERT (n > 0); + ASSERT (gn > 0); + ASSERT (usize != 0); + size = ABS (usize); + ASSERT (size <= n); - ASSERT (size <= asize); - ASSERT (asize + size <= talloc); + an = n; + MPN_NORMALIZE (ap, an); - mpn_mul (tp, ap, asize, up, size); - size += asize; + if (an >= size) + mpn_mul (tp, ap, an, up, size); + else + mpn_mul (tp, up, size, ap, an); + + size += an; - ASSERT (csize <= size); + ASSERT (gn <= size); if (usize > 0) { - /* |v| = -v = (u a - c) / b */ + /* |v| = -v = (u a - g) / b */ - ASSERT_NOCARRY (mpn_sub (tp, tp, size, cp, csize)); + ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn)); MPN_NORMALIZE (tp, size); if (size == 0) return 0; @@ -838,495 +120,432 @@ compute_v (mp_ptr vp, mp_size_t valloc, else { /* usize < 0 */ /* |v| = v = (c - u a) / b = (c + |u| a) / b */ - mp_limb_t cy = mpn_add (tp, tp, 
size, cp, csize); + mp_limb_t cy = mpn_add (tp, tp, size, gp, gn); if (cy) - { - ASSERT (size < talloc); - tp[size++] = cy; - } + tp[size++] = cy; } /* Now divide t / b. There must be no remainder */ + bn = n; + MPN_NORMALIZE (bp, bn); + ASSERT (size >= bn); - ASSERT (size >= bsize); - ASSERT (size + bsize <= talloc); - rp = tp + size; - - vsize = size + 1 - bsize; - ASSERT (vsize <= valloc); + vn = size + 1 - bn; + ASSERT (vn <= n + 1); - mpn_tdiv_qr (vp, rp, 0, tp, size, bp, bsize); - MPN_NORMALIZE (vp, vsize); + /* FIXME: Use divexact. Or do the entire calculation mod 2^{n * + GMP_NUMB_BITS}. */ + mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn); + vn -= (vp[vn-1] == 0); /* Remainder must be zero */ #if WANT_ASSERT { mp_size_t i; - for (i = 0; i < bsize; i++) + for (i = 0; i < bn; i++) { - ASSERT (rp[i] == 0); + ASSERT (tp[i] == 0); } } #endif - return vsize; + return vn; } -static mp_size_t -gcdext_schoenhage_itch (mp_size_t asize, mp_size_t bsize) -{ - mp_size_t itch; - - mp_size_t ralloc = asize + 1; - mp_size_t ualloc = bsize + 1; - /* Input size for hgcd calls */ - mp_size_t halloc = (asize + 1) / 2; +/* Temporary storage: - /* Storage for the rows and quotient */ - mp_size_t rstorage = 4 * ralloc + 4 * ualloc + asize; + Initial division: Quotient of at most an - n + 1 <= an limbs. - /* Storage for hgcd calls */ - mp_size_t tstorage = mpn_hgcd_init_itch (halloc) - + qstack_itch (halloc) - + mpn_hgcd_itch (halloc); + Storage for u0 and u1: 2(n+1). - /* Storage needed for final gcdext_lehmer */ - mp_size_t lstorage - = gcdext_lehmer_itch (GCDEXT_SCHOENHAGE_THRESHOLD, - GCDEXT_SCHOENHAGE_THRESHOLD); + Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4) - /* Storage needed after final nhgcd_gcdext_lehmer */ - mp_size_t fstorage - = COMPUTE_V_ITCH (GCDEXT_SCHOENHAGE_THRESHOLD, - GCDEXT_SCHOENHAGE_THRESHOLD, - ualloc); + Storage for hgcd, input (n + 1)/2: 9 n/4 plus some. 
+ + When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 3(n+1) for the cofactors. + + When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less. + + For the lehmer call after the loop, Let T denote + GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for + u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T + + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient. + +*/ - /* We need rstorage + MAX (tstorage, lstorage, fstorage) */ +/* Optimal choice of p seems difficult. In each iteration the division + * of work beteen hgcd and the updates of u0 and u1 depends on the + * current size of the u. It may be desirable to use a different + * choice of p in each iteration. Also the input size seems to matter; + * choosing p = n / 3 in the first iteration seems to improve + * performance slightly for input size just above the theshold, but + * degrade performance for larger inputs. */ +#define CHOOSE_P_1(n) ((n) / 2) +#define CHOOSE_P_2(n) ((n) / 3) - itch = tstorage; - if (lstorage > tstorage) - itch = lstorage; - if (fstorage > itch) - itch = fstorage; +mp_size_t +mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, + mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n) +{ + mp_size_t talloc; + mp_size_t scratch; + mp_size_t matrix_scratch; + mp_size_t ualloc = n + 1; - return rstorage + itch; -} + mp_size_t un; + mp_ptr u0; + mp_ptr u1; -#if WANT_ASSERT -static void -sanity_check_row (mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - int sign, mp_size_t usize, - const struct hgcd_row *r) -{ - /* Check that x = u * a + v * b, for some v, i.e. that - x - u*a is divisible by b. 
*/ - mp_srcptr up = r->uvp[0]; - mp_srcptr xp = r->rp; - mp_size_t xsize = r->rsize; mp_ptr tp; - mp_size_t tsize; - mp_ptr qp; - mp_size_t qsize; - mp_ptr rp; - mp_size_t i; + TMP_DECL; - TMP_MARK; - ASSERT (asize > 0 && ap[asize - 1] != 0); - ASSERT (bsize > 0 && bp[bsize - 1] != 0); - ASSERT (xsize == 0 || xp[xsize - 1] != 0); - ASSERT (MPN_LEQ_P (xp, xsize, ap, asize)); - ASSERT (MPN_LEQ_P (up, usize, bp, bsize)); + ASSERT (an >= n); + ASSERT (n > 0); - MPN_NORMALIZE (up, usize); - if (usize == 0) - { - ASSERT (MPN_EQUAL_P (xp, xsize, bp, bsize)); - return; - } - - tp = TMP_ALLOC_LIMBS (usize + asize + 1); - qp = TMP_ALLOC_LIMBS (usize + asize + 2 - bsize); - rp = TMP_ALLOC_LIMBS (bsize); - - mpn_mul (tp, ap, asize, up, usize); - tsize = asize + usize; - tsize -= (tp[tsize - 1] == 0); + TMP_MARK; - if (sign >= 0) - { - ASSERT_NOCARRY (mpn_sub (tp, tp, tsize, xp, xsize)); - MPN_NORMALIZE (tp, tsize); - } - else - { - mp_limb_t cy = mpn_add (tp, tp, tsize, xp, xsize); - tp[tsize] = cy; - tsize += (cy != 0); - } + /* FIXME: Check for small sizes first, before setting up temporary + storage etc. */ + talloc = MPN_GCDEXT_LEHMER_N_ITCH(n); + + /* For initial division */ + scratch = an - n + 1; + if (scratch > talloc) + talloc = scratch; - if (tsize > 0) + if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) { - mpn_tdiv_qr (qp, rp, 0, tp, tsize, bp, bsize); - for (i = 0; i < bsize; i++) - ASSERT (rp[i] == 0); - qsize = tsize - bsize; - qsize += (qp[qsize] != 0); - ASSERT (MPN_LEQ_P (qp, qsize, ap, asize)); - } - TMP_FREE; -} -# define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r) \ -sanity_check_row (ap, asize, bp, bsize, sign, usize, r) - -#else /* !WANT_ASSERT */ -# define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r) -#endif /* !WANT_ASSERT */ + /* For hgcd loop. 
*/ + mp_size_t hgcd_scratch; + mp_size_t update_scratch; + mp_size_t p1 = CHOOSE_P_1 (n); + mp_size_t p2 = CHOOSE_P_2 (n); + mp_size_t min_p = MIN(p1, p2); + mp_size_t max_p = MAX(p1, p2); + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p); + hgcd_scratch = mpn_hgcd_itch (n - min_p); + update_scratch = max_p + n - 1; + + scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); + if (scratch > talloc) + talloc = scratch; -static mp_size_t -gcdext_schoenhage (mp_ptr gp, mp_ptr up, mp_size_t *usizep, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_ptr tp, mp_size_t talloc) -{ - mp_size_t scratch; - struct hgcd hgcd; - struct qstack quotients; - struct hgcd_row r[4]; + /* Final mpn_gcdext_lehmer_n call. Need space for u and for + copies of a and b. */ + scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD) + + 3*GCDEXT_DC_THRESHOLD; - /* Size and sign of u fields. The largest u should be normalized to - this size, and except for the case u1 = 0, that is the latest - u. 
*/ - int rsize; - int rsign; + if (scratch > talloc) + talloc = scratch; - mp_ptr qp; - mp_size_t qsize; - mp_size_t ralloc = asize + 1; - mp_size_t ualloc = bsize + 1; - - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); - - ASSERT (4 * ralloc + 4*ualloc + asize <= talloc); - - r[0].rp = tp; tp += ralloc; talloc -= ralloc; - r[1].rp = tp; tp += ralloc; talloc -= ralloc; - r[2].rp = tp; tp += ralloc; talloc -= ralloc; - r[3].rp = tp; tp += ralloc; talloc -= ralloc; - - /* Must zero out the u fields */ - MPN_ZERO (tp, 4 * ualloc); - - r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + /* Cofactors u0 and u1 */ + talloc += 2*(n+1); + } - qp = tp; tp += asize; talloc -= asize; + tp = TMP_ALLOC_LIMBS(talloc); - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; - MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; + if (an > n) + { + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n); - r[0].uvp[0][0] = 1; - r[1].uvp[0][0] = 0; + if (mpn_zero_p (ap, n)) + { + MPN_COPY (gp, bp, n); + *usizep = 0; + TMP_FREE; + return n; + } + } - /* We don't use the v fields. */ - rsize = 1; - rsign = 0; + if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) + { + mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp); - scratch = mpn_hgcd_init_itch ((asize + 1) / 2); - ASSERT (scratch <= talloc); - mpn_hgcd_init (&hgcd, (asize + 1) / 2, tp); - tp += scratch; talloc -= scratch; + TMP_FREE; + return gn; + } + + MPN_ZERO (tp, 2*ualloc); + u0 = tp; tp += ualloc; + u1 = tp; tp += ualloc; { - mp_size_t nlimbs = qstack_itch ((asize + 1) / 2); + /* For the first hgcd call, there are no u updates, and it makes + some sense to use a different choice for p. */ + + /* FIXME: We could trim use of temporary storage, since u0 and u1 + are not used yet. 
For the hgcd call, we could swap in the u0 + and u1 pointers for the relevant matrix elements. We could also + use a specialized hgcd function which computes only the last + two elements of the matrix. */ + + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P_1 (n); /* Same as for gcd. */ + mp_size_t nn; + + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) + { + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); - ASSERT (nlimbs <= talloc); - qstack_init ("ients, (asize + 1) / 2, tp, nlimbs); + /* Temporary storage 2 (p + M->n) <= p + n - 1 */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch); - tp += nlimbs; - talloc -= nlimbs; - scratch += nlimbs; + MPN_COPY (u0, M.p[1][0], M.n); + MPN_COPY (u1, M.p[1][1], M.n); + un = M.n; + while ( (u0[un-1] | u1[un-1] ) == 0) + un--; + } + else + { + /* mpn_hgcd has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. 
*/ + mp_size_t gn; + mp_size_t updated_un = 1; + + u1[0] = 1; + + /* Temporary storage n + 1 */ + n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n, + u0, u1, &updated_un, tp); + if (n == 0) + { + TMP_FREE; + return gn; + } + + un = updated_un; + ASSERT (un < ualloc); + } } - - while (ABOVE_THRESHOLD (r[0].rsize, GCDEXT_SCHOENHAGE_THRESHOLD) - && r[1].rsize > 0) + + while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) { - mp_size_t k = r[0].rsize / 2; - int res; - - ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r); - ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 1); - - if (r[1].rsize <= k) - goto euclid; + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P_2 (n); + mp_size_t nn; - qstack_reset ("ients, r[0].rsize - k); - - res = mpn_hgcd (&hgcd, - r[0].rp + k, r[0].rsize - k, - r[1].rp + k, r[1].rsize - k, - "ients, - tp, talloc); - - if (res == 0 || res == 1) + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) { - euclid: - qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); - rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc); - ASSERT (rsize < ualloc); - - ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2); - - HGCD_SWAP3_LEFT (r); - rsign = ~rsign; + mp_size_t n0, n1; + mp_ptr t0; + mp_ptr t1; + + t0 = tp + matrix_scratch; + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + + /* Temporary storage 2 (p + M->n) <= p + n - 1 */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0); + + t1 = t0 + un; + + /* FIXME: This copying could be avoided by some swapping of + * pointers. May need more temporary storage, though. 
*/ + MPN_COPY (t0, u0, un); + MPN_COPY (t1, u1, un); + + /* By the same analysis as for mpn_hgcd_matrix_mul */ + ASSERT (M.n + un <= ualloc); + + /* Temporary storage un */ + n0 = addmul2_n (u0, t0, t1, un, + M.p[0][0], M.p[1][0], M.n, t1 + un); + n1 = addmul2_n (u1, t0, t1, un, + M.p[0][1], M.p[1][1], M.n, t1 + un); + + if (n0 > un) + un = n0; + if (n1 > un) + un = n1; + + ASSERT (un < ualloc); + ASSERT ( (u0[un-1] | u1[un-1]) > 0); } else { - const struct hgcd_row *s = hgcd.row + (res - 2); - int sign = hgcd.sign; - if (res == 3) - sign = ~sign; - - /* s[0] and s[1] are correct */ - r[2].rsize - = mpn_hgcd_fix (k, r[2].rp, ralloc, - sign, hgcd.size, s, - r[0].rp, r[1].rp, - tp, talloc); - - r[3].rsize - = mpn_hgcd_fix (k, r[3].rp, ralloc, - ~sign, hgcd.size, s+1, - r[0].rp, r[1].rp, - tp, talloc); - - rsize = hgcd_mul_vector (r + 2, ualloc, s, hgcd.size, - r, rsize, tp, talloc); - ASSERT (rsize < ualloc); - - rsign ^= sign; - ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2); - ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 3); - - HGCD_SWAP4_2 (r); + /* mpn_hgcd has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + mp_size_t gn; + mp_size_t updated_un = un; + + /* Temporary storage n + 1 */ + n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n, + u0, u1, &updated_un, tp); + if (n == 0) + { + TMP_FREE; + return gn; + } + + un = updated_un; + ASSERT (un < ualloc); } } - if (r[1].rsize == 0) + + if (mpn_zero_p (ap, n)) { - MPN_COPY (gp, r[0].rp, r[0].rsize); - MPN_NORMALIZE (r[0].uvp[0], rsize); - MPN_COPY (up, r[0].uvp[0], rsize); + MPN_COPY (gp, bp, n); + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + *usizep = -un; - *usizep = (rsign >= 0) ? 
rsize : - rsize; - return r[0].rsize; + TMP_FREE; + return n; } - else if (r[0].rsize == 1) + else if (mpn_zero_p (bp, n)) { - mp_limb_t u; - mp_limb_t v; - mp_limb_t cy; - - gp[0] = gcdext_1 (&u, &v, r[0].rp[0], r[1].rp[0]); + MPN_COPY (gp, ap, n); + MPN_NORMALIZE (u1, un); + MPN_COPY (up, u1, un); + *usizep = un; - /* g = u r0 + v r1 = (u u0 + v u1) a + (...) b */ - cy = mpn_addmul2_n_1 (up, rsize, - r[0].uvp[0], u, - r[1].uvp[0], v); - - rsize++; - if (cy) - up[rsize++] = cy; - else - MPN_NORMALIZE (up, rsize); + TMP_FREE; + return n; + } + else if (mpn_zero_p (u0, un)) + { + mp_size_t gn; + ASSERT (un == 1); + ASSERT (u1[0] == 1); - *usizep = (rsign >= 0) ? rsize : -rsize; - return 1; + /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */ + gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp); + TMP_FREE; + return gn; } else { - /* We have r0 = u0 a + v0 b, - r1 = u1 a + v1 b + /* We have A = ... a + ... b + B = u0 a + u1 b + + a = u1 A + ... B + b = -u0 A + ... B - Compute g = u r0 + v r1 = (u u0 + v u1) a + (...) b - In the expression (u u0 + v u1), we have + with bounds - u <= r1, - u0 <= b/r0 (except if r0 = a, which should never be the case here) - v <= r0 - u1 <= b/r0 - */ + |u0|, |u1| <= B / min(a, b) + + Compute g = u a + v b = (u u1 - v u0) A + (...) 
B + Here, u, v are bounded by - mp_size_t gsize; - mp_size_t usize; - mp_size_t vsize; - - /* u1 should be non-zero, and normalized */ - ASSERT (rsize); - ASSERT (r[1].uvp[0][rsize - 1] != 0); -#if WANT_TRACE - trace ("gcdext: \n" - "r0 = %Nd\n" - "r1 = %Nd\n" - "u0 = %Nd\n" - "u1 = %Nd\n", - r[0].rp, r[0].rsize, r[1].rp, r[1].rsize, - r[0].uvp[0], rsize, r[1].uvp[0], rsize); -#endif - /* We don't need the space for hgcd and the quotient stack any more */ - tp -= scratch; talloc += scratch; - - /* Stores u in r[2] and v in r[3] */ - gsize = gcdext_lehmer (gp, r[2].uvp[0], &usize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize, - tp, talloc); + |u| <= b, + |v| <= a + */ - if (usize == 0) + mp_size_t u0n; + mp_size_t u1n; + mp_size_t lehmer_un; + mp_size_t lehmer_vn; + mp_size_t gn; + + mp_ptr lehmer_up; + mp_ptr lehmer_vp; + int negate; + + lehmer_up = tp; tp += n; + + /* Call mpn_gcdext_lehmer_n with copies of a and b. */ + MPN_COPY (tp, ap, n); + MPN_COPY (tp + n, bp, n); + gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n); + + u0n = un; + MPN_NORMALIZE (u0, u0n); + if (lehmer_un == 0) { - /* u == 0 ==> v = g / b == 1 ==> g = u1 a + (...) b */ + /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */ + MPN_COPY (up, u0, u0n); + *usizep = -u0n; - MPN_NORMALIZE (r[1].uvp[0], rsize); - MPN_COPY (up, r[1].uvp[0], rsize); - *usizep = (rsign >= 0) ? 
- rsize : rsize; - - return gsize; + TMP_FREE; + return gn; } - /* Compute v = (g - s r0) / r1, storing it in r[3] */ - vsize = compute_v (r[3].uvp[0], ualloc, - r[0].rp, r[0].rsize, r[1].rp, r[1].rsize, - gp, gsize, - r[2].uvp[0], usize, - tp, talloc); + lehmer_vp = tp; + /* Compute v = (g - u a) / b */ + lehmer_vn = compute_v (lehmer_vp, + ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1); - if (usize < 0) + if (lehmer_un > 0) + negate = 0; + else { - usize = - usize; - rsign = ~rsign; + lehmer_un = -lehmer_un; + negate = 1; } - /* It's possible that u0 = 0, u1 = 1 */ - if (rsize == 1 && r[0].uvp[0][0] == 0) - { - /* u0 == 0 ==> u u0 + v u1 = v */ - MPN_COPY (up, r[3].uvp[0], vsize); - *usizep = (rsign >= 0) ? vsize : - vsize; + u1n = un; + MPN_NORMALIZE (u1, u1n); - return gsize; + /* It's possible that u0 = 1, u1 = 0 */ + if (u1n == 0) + { + ASSERT (un == 1); + ASSERT (u0[0] == 1); + + /* u1 == 0 ==> u u1 + v u0 = v */ + MPN_COPY (up, lehmer_vp, lehmer_vn); + *usizep = negate ? lehmer_vn : - lehmer_vn; + + TMP_FREE; + return gn; } - /* Ok, now u0, u1, u are non-zero. We may still have v == 0 */ - ASSERT (usize + rsize <= ualloc); - ASSERT (vsize + rsize <= ualloc); + ASSERT (lehmer_un + u1n <= ualloc); + ASSERT (lehmer_vn + u0n <= ualloc); + + /* Now u0, u1, u are non-zero. 
We may still have v == 0 */ /* Compute u u0 */ - if (usize <= rsize) + if (lehmer_un <= u1n) /* Should be the common case */ - mpn_mul (up, - r[0].uvp[0], rsize, - r[2].uvp[0], usize); + mpn_mul (up, u1, u1n, lehmer_up, lehmer_un); else - mpn_mul (up, - r[2].uvp[0], usize, - r[0].uvp[0], rsize); + mpn_mul (up, lehmer_up, lehmer_un, u1, u1n); - usize += rsize; + un = u1n + lehmer_un; + un -= (up[un - 1] == 0); - /* There may be more than one zero limb, if #u0 < #u1 */ - MPN_NORMALIZE (up, usize); - ASSERT (usize < ualloc); - - if (vsize) + if (lehmer_vn > 0) { mp_limb_t cy; - /* Overwrites old r[2].uvp[0] value */ - if (vsize <= rsize) + /* Overwrites old u1 value */ + if (lehmer_vn <= u0n) /* Should be the common case */ - cy = mpn_mul (r[2].uvp[0], - r[1].uvp[0], rsize, - r[3].uvp[0], vsize); + mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn); else - cy = mpn_mul (r[2].uvp[0], - r[3].uvp[0], vsize, - r[1].uvp[0], rsize); + mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n); - vsize += rsize - (cy == 0); - ASSERT (vsize < ualloc); + u1n = u0n + lehmer_vn; + u1n -= (u1[u1n - 1] == 0); - if (vsize <= usize) - cy = mpn_add (up, up, usize, r[2].uvp[0], vsize); + if (u1n <= un) + { + cy = mpn_add (up, up, un, u1, u1n); + } else { - cy = mpn_add (up, r[2].uvp[0], vsize, up, usize); - usize = vsize; + cy = mpn_add (up, u1, u1n, up, un); + un = u1n; } - up[usize] = cy; - usize += (cy != 0); + up[un] = cy; + un += (cy != 0); - ASSERT (usize < ualloc); + ASSERT (un < ualloc); } - *usizep = (rsign >= 0) ? usize : -usize; + *usizep = negate ? 
-un : un; - return gsize; - } -} - -mp_size_t -mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, - mp_ptr ap, mp_size_t asize, mp_ptr bp, mp_size_t bsize) -{ - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - - if (asize == 1) - { -#if GCDEXT_1_USE_BINARY - mp_limb_t v; - *gp = gcdext_1 (up, &v, ap[0], bp[0]); -#else - *gp = gcdext_1_u (up, ap[0], bp[0]); -#endif - *usizep = (up[0] != 0); - ASSERT(gp[0] != 0); - return 1; - } - else if (BELOW_THRESHOLD (asize, GCDEXT_SCHOENHAGE_THRESHOLD)) - { - mp_size_t gsize; - mp_ptr tp; - mp_size_t talloc = gcdext_lehmer_itch (asize, bsize); - TMP_DECL; - TMP_MARK; - - tp = TMP_ALLOC_LIMBS (talloc); - gsize = gcdext_lehmer (gp, up, usizep, ap, asize, bp, bsize, - tp, talloc); - TMP_FREE; - return gsize; - } - else - { - mp_size_t gsize; - mp_ptr tp; - mp_size_t talloc = gcdext_schoenhage_itch (asize, bsize); - TMP_DECL; - TMP_MARK; - - tp = TMP_ALLOC_LIMBS (talloc); - gsize = gcdext_schoenhage (gp, up, usizep, ap, asize, bp, bsize, - tp, talloc); TMP_FREE; - return gsize; + return gn; } } diff --git a/mpn/generic/gcdext_1.c b/mpn/generic/gcdext_1.c new file mode 100644 index 000000000..efade2b4c --- /dev/null +++ b/mpn/generic/gcdext_1.c @@ -0,0 +1,319 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +/* Default to binary gcdext_1, since it is best on most current machines. + We should teach tuneup to choose the right gcdext_1. */ +#define GCDEXT_1_USE_BINARY 1 + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef NULL +# define NULL ((void *) 0) +#endif + +/* FIXME: Takes two single-word limbs. It could be extended to a + * function that accepts a bignum for the first input, and only + * returns the first co-factor. */ + +/* Returns g, u and v such that g = u A - v B. There are three + different cases for the result: + + g = u A - v B, 0 < u < b, 0 < v < a + g = A u = 1, v = 0 + g = B u = B, v = A - 1 + + We always return with 0 < u <= b, 0 <= v < a. +*/ +#if GCDEXT_1_USE_BINARY + +static mp_limb_t +gcdext_1_odd (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) +{ + mp_limb_t u0; + mp_limb_t v0; + mp_limb_t v1; + mp_limb_t u1; + + mp_limb_t B = b; + mp_limb_t A = a; + + /* Through out this function maintain + + a = u0 A - v0 B + b = u1 A - v1 B + + where A and B are odd. 
*/ + + u0 = 1; v0 = 0; + u1 = b; v1 = a-1; + + if (A == 1) + { + *up = u0; *vp = v0; + return 1; + } + else if (B == 1) + { + *up = u1; *vp = v1; + return 1; + } + + while (a != b) + { + mp_limb_t mask; + + ASSERT (a % 2 == 1); + ASSERT (b % 2 == 1); + + ASSERT (0 < u0); ASSERT (u0 <= B); + ASSERT (0 < u1); ASSERT (u1 <= B); + + ASSERT (0 <= v0); ASSERT (v0 < A); + ASSERT (0 <= v1); ASSERT (v1 < A); + + if (a > b) + { + MP_LIMB_T_SWAP (a, b); + MP_LIMB_T_SWAP (u0, u1); + MP_LIMB_T_SWAP (v0, v1); + } + + ASSERT (a < b); + + /* Makes b even */ + b -= a; + + mask = - (mp_limb_t) (u1 < u0); + u1 += B & mask; + v1 += A & mask; + u1 -= u0; + v1 -= v0; + + ASSERT (b % 2 == 0); + + do + { + /* As b = u1 A + v1 B is even, while A and B are odd, + either both or none of u1, v1 is even */ + + ASSERT (u1 % 2 == v1 % 2); + + mask = -(u1 & 1); + u1 = u1 / 2 + ((B / 2) & mask) - mask; + v1 = v1 / 2 + ((A / 2) & mask) - mask; + + b /= 2; + } + while (b % 2 == 0); + } + + /* Now g = a = b */ + ASSERT (a == b); + ASSERT (u1 <= B); + ASSERT (v1 < A); + + ASSERT (A % a == 0); + ASSERT (B % a == 0); + ASSERT (u0 % (B/a) == u1 % (B/a)); + ASSERT (v0 % (A/a) == v1 % (A/a)); + + *up = u0; *vp = v0; + + return a; +} + +mp_limb_t +mpn_gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) +{ + unsigned shift = 0; + mp_limb_t g; + mp_limb_t u; + mp_limb_t v; + + /* We use unsigned values in the range 0, ... B - 1. As the values + are uniquely determined only modulo B, we can add B at will, to + get numbers in range or flip the least significant bit. */ + /* Deal with powers of two */ + while ((a | b) % 2 == 0) + { + a /= 2; b /= 2; shift++; + } + + if (b % 2 == 0) + { + unsigned k = 0; + + do { + b /= 2; k++; + } while (b % 2 == 0); + + g = gcdext_1_odd (&u, &v, a, b); + + while (k--) + { + /* We have g = u a + v b, and need to construct + g = u'a + v'(2b). 
+ + If v is even, we can just set u' = u, v' = v/2 + If v is odd, we can set v' = (v + a)/2, u' = u + b + */ + + if (v % 2 == 0) + v /= 2; + else + { + u = u + b; + v = v/2 + a/2 + 1; + } + b *= 2; + } + } + else if (a % 2 == 0) + { + unsigned k = 0; + + do { + a /= 2; k++; + } while (a % 2 == 0); + + g = gcdext_1_odd (&u, &v, a, b); + + while (k--) + { + /* We have g = u a + v b, and need to construct + g = u'(2a) + v'b. + + If u is even, we can just set u' = u/2, v' = v. + If u is odd, we can set u' = (u + b)/2 + */ + + if (u % 2 == 0) + u /= 2; + else + { + u = u/2 + b/2 + 1; + v = v + a; + } + a *= 2; + } + } + else + /* Ok, both are odd */ + g = gcdext_1_odd (&u, &v, a, b); + + *up = u; + *vp = v; + + return g << shift; +} + +#else /* ! GCDEXT_1_USE_BINARY */ +static mp_limb_t +gcdext_1_u (mp_limb_t *up, mp_limb_t a, mp_limb_t b) +{ + /* Maintain + + a = u0 A mod B + b = - u1 A mod B + */ + mp_limb_t u0 = 1; + mp_limb_t u1 = 0; + mp_limb_t B = b; + + ASSERT (a >= b); + ASSERT (b > 0); + + for (;;) + { + mp_limb_t q; + + q = a / b; + a -= q * b; + + if (a == 0) + { + *up = B - u1; + return b; + } + u0 += q * u1; + + q = b / a; + b -= q * a; + + if (b == 0) + { + *up = u0; + return a; + } + u1 += q * u0; + } +} + +mp_limb_t +mpn_gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) +{ + /* Maintain + + a = u0 A - v0 B + b = - u1 A + v1 B = (B - u1) A - (A - v1) B + */ + mp_limb_t u0 = 1; + mp_limb_t v0 = 0; + mp_limb_t u1 = 0; + mp_limb_t v1 = 1; + + mp_limb_t A = a; + mp_limb_t B = b; + + ASSERT (a >= b); + ASSERT (b > 0); + + for (;;) + { + mp_limb_t q; + + q = a / b; + a -= q * b; + + if (a == 0) + { + *up = B - u1; + *vp = A - v1; + return b; + } + u0 += q * u1; + v0 += q * v1; + + q = b / a; + b -= q * a; + + if (b == 0) + { + *up = u0; + *vp = v0; + return a; + } + u1 += q * u0; + v1 += q * v0; + } +} +#endif /* ! 
GCDEXT_1_USE_BINARY */ diff --git a/mpn/generic/gcdext_lehmer.c b/mpn/generic/gcdext_lehmer.c new file mode 100644 index 000000000..34a503d19 --- /dev/null +++ b/mpn/generic/gcdext_lehmer.c @@ -0,0 +1,162 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Temporary storage: 2*(n+1) for u. n+1 for the matrix-vector + multiplications (if hgcd2 succeeds). If hgcd fails, n+1 limbs are + needed for the division, with most n for the quotient, and n+1 for + the product q u0. In all, 4n + 3. */ + +mp_size_t +mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize, + mp_ptr ap, mp_ptr bp, mp_size_t n, + mp_ptr tp) +{ + mp_size_t ualloc = n + 1; + + /* Keeps track of the second row of the reduction matrix + * + * M = (v0, v1 ; u0, u1) + * + * which correspond to the first column of the inverse + * + * M^{-1} = (u1, -v1; -u0, v0) + */ + + mp_size_t un; + mp_ptr u0; + mp_ptr u1; + + MPN_ZERO (tp, 2*ualloc); + u0 = tp; tp += ualloc; + u1 = tp; tp += ualloc; + + u1[0] = 1; un = 1; + + /* FIXME: Handle n == 2 differently, after the loop? 
*/ + while (n >= 2) + { + struct hgcd_matrix1 M; + mp_limb_t ah, al, bh, bl; + mp_limb_t mask; + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else if (n == 2) + { + /* We use the full inputs without truncation, so we can + safely shift left. */ + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[1], ap[0]); + al = ap[0] << shift; + bh = MPN_EXTRACT_NUMB (shift, bp[1], bp[0]); + bl = bp[0] << shift; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_nhgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M)) + { + n = mpn_hgcd_mul_matrix1_inverse_vector (&M, n, ap, bp, tp); + un = mpn_hgcd_mul_matrix1_vector(&M, un, u0, u1, tp); + } + else + { + /* mpn_hgcd2 has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + mp_size_t gn; + mp_size_t updated_un = un; + + /* Temporary storage n + 1 */ + n = mpn_gcdext_subdiv_step (gp, &gn, up, usize, ap, bp, n, + u0, u1, &updated_un, tp); + if (n == 0) + return gn; + + un = updated_un; + } + } + if (ap[0] == 0) + { + gp[0] = bp[0]; + + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + + *usize = -un; + return 1; + } + else if (bp[0] == 0) + { + gp[0] = ap[0]; + + MPN_NORMALIZE (u1, un); + MPN_COPY (up, u1, un); + + *usize = un; + return 1; + } + else + { + mp_limb_t uh, vh; + mp_limb_t u; + mp_limb_t v; + + gp[0] = mpn_gcdext_1 (&u, &v, ap[0], bp[0]); + + /* Set up = u u1 + v u0. Keep track of size, un grows by one or + two limbs. 
*/ + uh = mpn_mul_1 (up, u1, un, u); + vh = mpn_addmul_1 (up, u0, un, v); + + if ( (uh | vh) > 0) + { + mp_limb_t cy; + uh += vh; + up[un++] = uh; + if (uh < vh) + up[un++] = 1; + } + + *usize = un; + return 1; + } +} diff --git a/mpn/generic/gcdext_subdiv_step.c b/mpn/generic/gcdext_subdiv_step.c new file mode 100644 index 000000000..8a4ba1f42 --- /dev/null +++ b/mpn/generic/gcdext_subdiv_step.c @@ -0,0 +1,188 @@ +/* gcdext_subdiv_step.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +static inline int +mpn_zero_p (mp_srcptr ap, mp_size_t n) +{ + mp_size_t i; + for (i = n - 1; i >= 0; i--) + { + if (ap[i] != 0) + return 0; + } + return 1; +} + +/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or + b is small, or the difference is small. Perform one subtraction + followed by one division. If the gcd is found, stores it in gp and + *gn, and returns zero. Otherwise, compute the reduced a and b, + return the new size, and cofactors. 
*/ + +/* Temporary storage: Let N be a bound both for the inputs a, b, and + the cofactors u0, u1 after the division step. Then up to N is + needed for the quotient, and N+1 for the product q u0. All in all, + 2N + 1. */ +mp_size_t +mpn_gcdext_subdiv_step (mp_ptr gp, mp_size_t *gn, mp_ptr up, mp_size_t *usizep, + mp_ptr ap, mp_ptr bp, mp_size_t n, + mp_ptr u0, mp_ptr u1, mp_size_t *unp, mp_ptr tp) + +{ + mp_size_t an, bn, un; + mp_size_t qn; + mp_size_t u0n; + + int swapped; + + an = bn = n; + + ASSERT (an > 0); + ASSERT (ap[an-1] > 0 || bp[an-1] > 0); + + MPN_NORMALIZE (ap, an); + MPN_NORMALIZE (bp, bn); + + un = *unp; + + swapped = 0; + + if (UNLIKELY (an == 0)) + { + return_b: + MPN_COPY (gp, bp, bn); + *gn = bn; + + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + + *usizep = swapped ? un : -un; + + return 0; + } + else if (UNLIKELY (bn == 0)) + { + return_a: + MPN_COPY (gp, ap, an); + *gn = an; + + MPN_NORMALIZE (u1, un); + MPN_COPY (up, u1, un); + + *usizep = swapped ? -un : un; + + return 0; + } + + /* Arrange so that a > b, subtract an -= bn, and maintain + normalization. 
*/ + if (an < bn) + { + MPN_PTR_SWAP (ap, an, bp, bn); + MP_PTR_SWAP (u0, u1); + swapped ^= 1; + } + else if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + goto return_a; + else if (c < 0) + { + MP_PTR_SWAP (ap, bp); + MP_PTR_SWAP (u0, u1); + swapped ^= 1; + } + } + /* Reduce a -= b, u1 += u0 */ + ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn)); + MPN_NORMALIZE (ap, an); + ASSERT (an > 0); + + u1[un] = mpn_add_n (u1, u1, u0, un); + un += (u1[un] > 0); + + /* Arrange so that a > b, and divide a = q b + r */ + if (an < bn) + { + MPN_PTR_SWAP (ap, an, bp, bn); + MP_PTR_SWAP (u0, u1); + swapped ^= 1; + } + else if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + goto return_a; + else if (c < 0) + { + MP_PTR_SWAP (ap, bp); + MP_PTR_SWAP (u0, u1); + swapped ^= 1; + } + } + + /* Reduce a -= q b, u1 += q u0 */ + qn = an - bn + 1; + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn); + + if (mpn_zero_p (ap, bn)) + goto return_b; + + n = bn; + + /* Update u1 += q u0 */ + u0n = un; + MPN_NORMALIZE (u0, u0n); + + if (u0n > 0) + { + qn -= (tp[qn - 1] == 0); + + if (qn > u0n) + mpn_mul (tp + qn, tp, qn, u0, u0n); + else + mpn_mul (tp + qn, u0, u0n, tp, qn); + + if (qn + u0n > un) + { + ASSERT_NOCARRY (mpn_add (u1, tp + qn, qn + u0n, u1, un)); + un = qn + u0n; + un -= (u1[un-1] == 0); + } + else + { + u1[un] = mpn_add (u1, u1, un, tp + qn, qn + u0n); + un += (u1[un] > 0); + } + } + + *unp = un; + return n; +} diff --git a/mpn/generic/hgcd.c b/mpn/generic/hgcd.c index 8f1967b32..ae8053d77 100644 --- a/mpn/generic/hgcd.c +++ b/mpn/generic/hgcd.c @@ -4,7 +4,7 @@ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. -Copyright 2003, 2004, 2005 Free Software Foundation, Inc. +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -21,2125 +21,624 @@ License for more details. 
You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ -#define WANT_TRACE 0 - -#if WANT_TRACE -# include <stdio.h> -# include <stdarg.h> -#endif - #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" -#if WANT_TRACE -static void -trace (const char *format, ...) +/* For input of size n, matrix elements are of size at most ceil(n/2) + - 1, but we need two limbs extra. */ +void +mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p) { - va_list args; - va_start (args, format); - gmp_vfprintf (stderr, format, args); - va_end (args); + mp_size_t s = (n+1)/2 + 1; + M->alloc = s; + M->n = 1; + MPN_ZERO (p, 4 * s); + M->p[0][0] = p; + M->p[0][1] = p + s; + M->p[1][0] = p + 2 * s; + M->p[1][1] = p + 3 * s; + + M->p[0][0][0] = M->p[1][1][0] = 1; } -#endif - -/* Comparison of _normalized_ numbers. */ - -#define MPN_EQUAL_P(ap, asize, bp, bsize) \ -((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0) - -#define MPN_LEQ_P(ap, asize, bp, bsize) \ -((asize) < (bsize) || ((asize) == (bsize) \ - && mpn_cmp ((ap), (bp), (asize)) <= 0)) - -#define MPN_LESS_P(ap, asize, bp, bsize) \ -((asize) < (bsize) || ((asize) == (bsize) \ - && mpn_cmp ((ap), (bp), (asize)) < 0)) -/* Extract one limb, shifting count bits left - ________ ________ - |___xh___||___xl___| - |____r____| - >count < - - The count includes any nail bits, so it should work fine if - count is computed using count_leading_zeros. -*/ - -#define MPN_EXTRACT_LIMB(count, xh, xl) \ - ((((xh) << ((count) - GMP_NAIL_BITS)) & GMP_NUMB_MASK) | \ - ((xl) >> (GMP_LIMB_BITS - (count)))) - - -/* Return -1 if a < x + y + z, - 0 if a = x + y + z, - 1 if a > x + y + z. */ -static int -mpn_cmp_sum3 (mp_srcptr ap, mp_size_t an, - mp_srcptr xp, mp_size_t xn, - mp_srcptr yp, mp_size_t yn, - mp_srcptr zp, mp_size_t zn) +/* Updated column COL, adding in column (1-COL). 
*/ +static void +hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col) { - mp_limb_t cy; + mp_limb_t c0, c1; + ASSERT (col < 2); - /* Check that all limbs beyond an are zero. This should be slightly - cheaper than fully normalizing all the input numbers. */ + c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n); + c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n); - while (xn > an) - if (xp[--xn] > 0) return -1; - while (yn > an) - if (yp[--yn] > 0) return -1; - while (zn > an) - if (zp[--zn] > 0) return -1; + M->p[0][col][M->n] = c0; + M->p[1][col][M->n] = c1; - /* Start by sorting so that xn >= yn >= zn. Six permutations, so we - can't get away with less than three comparisons, at least not for - the worst case. */ - - if (xn < yn) - MPN_SRCPTR_SWAP (xp, xn, yp, yn); - if (yn < zn) - MPN_SRCPTR_SWAP (yp, yn, zp, zn); - if (xn < yn) - MPN_SRCPTR_SWAP (xp, xn, yp, yn); - - ASSERT (an >= xn && xn >= yn && yn >= zn); - - /* Assume that a = x + y + z, and write the addition limb by limb. - - (c[1], a[0]) = x[0] + y[0] + z[0] + c[0] - (c[2], a[1]) = x[1] + y[1] + z[1] + c[1] - (c[k+1], a[k]) = x[k] + y[k] + z[k] + c[2] - ... - (c[n], a[n-1]) = x[n-1] + y[n-1] + z[n-1] + c[n-1] - - where the start and stop conditions are that c[0] = c[n] = 0. - Then we can start at the high end, iterating - - c[k] = (c[k+1], a[k]) - x[k] - y[k] - z[k] - - If equality holds, then 0 <= c[k] <= 2 for all k (since for - example 0xf + 0xf + 0xf + 2 = 0x2f). If we find c[k] < 0, then we - know that a < x + y + z, and if we find c[k] > 2, then we know a - > x + y + z. */ + M->n += (c0 | c1) != 0; + ASSERT (M->n < M->alloc); +} - cy = 0; +/* Updated column COL, adding in column Q * (1-COL). Temporary + * storage: qn + n <= M->alloc, where n is the size of the largest + * element in column 1 - COL. 
*/ +static void +hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn, + unsigned col, mp_ptr tp) +{ + ASSERT (col < 2); - while (an > xn) + if (qn == 1) { - /* c[k] = (c[k+1], a[k]) */ - if (cy > 0) - return 1; + mp_limb_t q = qp[0]; + mp_limb_t c0, c1; - cy = ap[--an]; - } + c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q); + c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q); -#if GMP_NAIL_BITS >= 2 - while (an > yn) - { - if (cy > 1) - return 1; + M->p[0][col][M->n] = c0; + M->p[1][col][M->n] = c1; - cy = (cy << GMP_NUMB_BITS) + ap[--an]; - if (cy < xp[an]) - return -1; - cy -= xp[an]; + M->n += (c0 | c1) != 0; } - while (an > zn) + else { - mp_limb_t s; + unsigned row; - if (cy > 2) - return 1; + /* Carries for the unlikely case that we get both high words + from the multiplication and carries from the addition. */ + mp_limb_t c[2]; + mp_size_t n; - cy = (cy << GMP_NUMB_BITS ) + ap[--an]; - s = xp[an] + yp[an]; - if (cy < s) - return -1; - cy -= s; - } - while (an > 0) - { - mp_limb_t s; + /* The matrix will not necessarily grow in size by qn, so we + need normalization in order not to overflow M. 
*/ - if (cy > 2) - return 1; - - cy = (cy << GMP_NUMB_BITS ) + ap[--an]; - s = xp[an] + yp[an] + zp[an]; - if (cy < s) - return -1; - cy -= s; - } -#else /* GMP_NAIL_BITS < 2 */ -#if GMP_NAIL_BITS == 1 -loselose -#endif - while (an > yn) - { - /* c[k] = (c[k+1], a[k]) - x[k] */ - if (cy > 1) - return 1; + for (n = M->n; n + qn > M->n; n--) + { + ASSERT (n > 0); + if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0) + break; + } + + ASSERT (qn + n <= M->alloc); - --an; + for (row = 0; row < 2; row++) + { + if (qn <= n) + mpn_mul (tp, M->p[row][1-col], n, qp, qn); + else + mpn_mul (tp, qp, qn, M->p[row][1-col], n); - if (cy == 1) + ASSERT (n + qn >= M->n); + c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n); + } + if (c[0] | c[1]) { - if (ap[an] >= xp[an]) - return 1; - cy = (ap[an] - xp[an]) & GMP_NUMB_MASK; + M->n = n + qn + 1; + M->p[0][col][n-1] = c[0]; + M->p[1][col][n-1] = c[1]; } else { - /* cy == 0 */ - if (ap[an] < xp[an]) - return -1; - else - cy = ap[an] - xp[an]; + n += qn; + n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0; + if (n > M->n) + M->n = n; } } - while (an > zn) - { - mp_limb_t sh, sl; - - /* c[k] = (c[k+1], a[k]) - x[k] - y[k] */ - if (cy > 2) - return 1; - - --an; - - sl = xp[an] + yp[an]; - sh = (sl < xp[an]); - - if (cy < sh || (cy == sh && ap[an] < sl)) - return -1; - - sl = ap[an] - sl; /* Monkey business */ - sh = cy - sh - (sl > ap[an]); - if (sh > 0) - return 1; - cy = sl; - } - while (an > 0) - { - mp_limb_t sh, sl; - if (cy > 2) - return 1; - - --an; - - sl = xp[an] + yp[an]; - sh = (sl < xp[an]); - - sl += zp[an]; - sh += sl < zp[an]; - - if (cy < sh || (cy == sh && ap[an] < sl)) - return -1; - sl = ap[an] - sl; /* Monkey business */ - sh = cy - sh - (sl > ap[an]); - if (sh > 0) - return 1; - cy = sl; - } -#endif /* GMP_NAIL_BITS < 2 */ - return cy > 0; -} - -/* Only the first row has v = 0, a = 1 * a + 0 * b */ -static inline int -hgcd_start_row_p (const struct hgcd_row *r, mp_size_t n) -{ - mp_size_t i; - 
mp_srcptr vp = r->uvp[1]; - - for (i = 0; i < n; i++) - if (vp[i] != 0) - return 0; - - return 1; + ASSERT (M->n < M->alloc); } -/* Called when r[0, 1, 2] >= W^M, r[3] < W^M. Returns the number of - remainders that satisfy Jebelean's criterion, i.e. find the largest k - such that - - r[k+1] >= max (-u[k+1], - v[k+1]) - - r[k] - r[k-1] >= max (u[k+1] - u[k], v[k+1] - v[k]) - - Return 0 on failure, i.e. if B or A mod B < W^M. Return 1 in case - r0 and r1 are correct, but we still make no progress because r0 = - A, r1 = B. - - Otherwise return 2, 3 or 4, the number of r:s that are correct. - */ -static int -hgcd_jebelean (const struct hgcd *hgcd, mp_size_t M) +/* Multiply M by M1 from the right. Since the M1 elements fit in + GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs + temporary space M->n */ +static void +hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1, + mp_ptr tp) { - mp_size_t L; - unsigned bit; - - ASSERT (hgcd->row[0].rsize > M); - ASSERT (hgcd->row[1].rsize > M); - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize <= M); - - ASSERT (MPN_LESS_P (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[0].rp, hgcd->row[0].rsize)); - ASSERT (MPN_LESS_P (hgcd->row[2].rp, hgcd->row[2].rsize, - hgcd->row[1].rp, hgcd->row[1].rsize)); - ASSERT (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)); - - ASSERT (mpn_cmp (hgcd->row[0].uvp[1], hgcd->row[1].uvp[1], hgcd->size) <= 0); - ASSERT (mpn_cmp (hgcd->row[1].uvp[1], hgcd->row[2].uvp[1], hgcd->size) <= 0); - ASSERT (mpn_cmp (hgcd->row[2].uvp[1], hgcd->row[3].uvp[1], hgcd->size) <= 0); - - /* The bound is really floor (N/2), which is <= M = ceil (N/2) */ - L = hgcd->size; - ASSERT (L <= M); - - ASSERT (L > 0); - ASSERT (hgcd->row[3].uvp[1][L - 1] != 0); - - bit = hgcd->sign < 0; - - /* Check r1 - r2 >= max (u2 - u1, v2 - v1) = {|u1| + |u2|, |v1| + |v2|}[bit] */ - - if (mpn_cmp_sum3 (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[2].rp, 
hgcd->row[2].rsize, - hgcd->row[1].uvp[bit], L, - hgcd->row[2].uvp[bit], L) < 0) - return 2 - (hgcd_start_row_p (hgcd->row, hgcd->size)); - - /* Ok, r2 is correct */ - - /* Check r3 >= max (-u3, -v3) = (|u3|, |v3|)[bit] */ - if (hgcd->row[3].rsize > L) - /* Condition satisfied */ - ; - else + unsigned row; + mp_limb_t grow; + for (row = 0, grow = 0; row < 2; row++) { - mp_size_t size; - for (size = L; size > hgcd->row[3].rsize; size--) - { - if (hgcd->row[3].uvp[bit][size-1] != 0) - return 3; - } - if (mpn_cmp (hgcd->row[3].rp, hgcd->row[3].uvp[bit], size) < 0) - return 3; - } + mp_limb_t c0, c1; - /* Check r3 - r2 >= max(u3-u2, v3-v2) = {|u2| + |u3|, |v2| +|v3|}[1-bit] */ + /* Compute (u, u') <-- (r00 u + r10 u', r01 u + r11 u') as - if (mpn_cmp_sum3 (hgcd->row[2].rp, hgcd->row[2].rsize, - hgcd->row[3].rp, hgcd->row[3].rsize, - hgcd->row[2].uvp[bit ^ 1], L, - hgcd->row[3].uvp[bit ^ 1], L) < 0) - return 3; - - /* Ok, r3 is correct */ - return 4; -} + t = u + u *= r00 + u += r10 * u' + u' *= r11 + u' += r01 * t + */ + /* FIXME: Duplication with mpn_hgcd_mul_matrix1_vector. */ + MPN_COPY (tp, M->p[row][0], M->n); + c0 = mpn_mul_1 (M->p[row][0], M->p[row][0], M->n, M1->u[0][0]); + c0 += mpn_addmul_1 (M->p[row][0], M->p[row][1], M->n, M1->u[1][0]); + M->p[row][0][M->n] = c0; -/* Compute au + bv. u and v are single limbs, a and b are n limbs each. - Stores n+1 limbs in rp, and returns the (n+2)'nd limb. */ -/* FIXME: With nails, we can instead return limb n+1, possibly including - one non-zero nail bit. 
*/ -static mp_limb_t -mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n, - mp_srcptr ap, mp_limb_t u, - mp_srcptr bp, mp_limb_t v) -{ - mp_limb_t h; - mp_limb_t cy; + c1 = mpn_mul_1 (M->p[row][1], M->p[row][1], M->n, M1->u[1][1]); + c1 += mpn_addmul_1 (M->p[row][1], tp, M->n, M1->u[0][1]); + M->p[row][1][M->n] = c1; - h = mpn_mul_1 (rp, ap, n, u); - cy = mpn_addmul_1 (rp, bp, n, v); - h += cy; -#if GMP_NAIL_BITS == 0 - rp[n] = h; - return (h < cy); -#else /* GMP_NAIL_BITS > 0 */ - rp[n] = h & GMP_NUMB_MASK; - return h >> GMP_NUMB_BITS; -#endif /* GMP_NAIL_BITS > 0 */ -} - - -static inline void -qstack_drop (struct qstack *stack) -{ - ASSERT (stack->size_next); - stack->limb_next -= stack->size[--stack->size_next]; -} - -/* Get top element */ -static inline mp_size_t -qstack_get_0 (const struct qstack *stack, - mp_srcptr *qp) -{ - mp_size_t qsize; - ASSERT (stack->size_next); - - qsize = stack->size[stack->size_next - 1]; - *qp = stack->limb + stack->limb_next - qsize; - - return qsize; + grow |= (c0 | c1); + } + M->n += (grow != 0); + ASSERT (M->n < M->alloc); } -/* Get element just below the top */ -static inline mp_size_t -qstack_get_1 (const struct qstack *stack, - mp_srcptr *qp) -{ - mp_size_t qsize; - ASSERT (stack->size_next >= 2); +/* Perform a few steps, using some of mpn_hgcd2, subtraction and + division. Reduces the size by almost one limb or more, but never + below the given size s. Return new size for a and b, or 0 if no + more steps are possible. - qsize = stack->size[stack->size_next - 2]; - *qp = stack->limb + stack->limb_next - - stack->size[stack->size_next - 1] - - qsize; + If hgcd2 succeds, needs temporary space for hgcd_matrix_mul_1, M->n + limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2 + fails, needs space for the quotient, qn <= n - s + 1 limbs, for and + hgcd_matrix_update_q, qn + (size of the appropriate column of M) <= + resulting size of $. 
- return qsize; -} + If N is the input size to the calling hgcd, then s = floor(N/2) + + 1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1 + < N, so N is sufficient. +*/ -/* Adds d to the element on top of the stack */ -static void -qstack_adjust (struct qstack *stack, mp_limb_t d) +static mp_size_t +hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, + struct hgcd_matrix *M, mp_ptr tp) { - mp_size_t qsize; + struct hgcd_matrix1 M1; + mp_limb_t mask; + mp_limb_t ah, al, bh, bl; + mp_size_t an, bn, qn; mp_ptr qp; + int col; - ASSERT (stack->size_next); + ASSERT (n > s); - ASSERT_QSTACK (stack); + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); - if (stack->limb_next >= stack->limb_alloc) + if (n == s + 1) { - qstack_rotate (stack, 1); - } - - ASSERT (stack->limb_next < stack->limb_alloc); + if (mask < 4) + goto subtract; - qsize = stack->size[stack->size_next - 1]; - qp = stack->limb + stack->limb_next - qsize; - - if (qsize == 0) - { - qp[0] = 1 + d; - stack->size[stack->size_next - 1] = 1; - stack->limb_next++; + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; } - else + else if (mask & GMP_NUMB_HIGHBIT) { - mp_limb_t cy = mpn_add_1 (qp, qp, qsize, d); - if (cy) - { - qp[qsize] = cy; - stack->size[stack->size_next - 1]++; - stack->limb_next++; - } + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; } - - ASSERT_QSTACK (stack); -} - -/* hgcd2 operations */ - -/* Computes P = R * S. No overlap allowed. 
*/ -static mp_size_t -hgcd2_mul (struct hgcd_row *P, mp_size_t alloc, - const struct hgcd2_row *R, - const struct hgcd_row *S, mp_size_t n) -{ - int grow = 0; - mp_limb_t h = 0; - unsigned i; - unsigned j; - - ASSERT (n < alloc); - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - { - /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] - = u_i s0j + v_i s1j */ - mp_limb_t cy; - - cy = mpn_addmul2_n_1 (P[i].uvp[j], n, - S[0].uvp[j], R[i].u, - S[1].uvp[j], R[i].v); - if (cy) - { - ASSERT (n + 2 <= alloc); - P[i].uvp[j][n+1] = cy; - grow = 1; - } - else - h |= P[i].uvp[j][n]; - } - if (grow) - return n + 2; else - /* Don't add redundant zeroes */ - return n + (h != 0); -} - -unsigned -mpn_hgcd_max_recursion (mp_size_t n) -{ - int count; - - count_leading_zeros (count, (mp_limb_t) - (1 + n / (HGCD_SCHOENHAGE_THRESHOLD - 5))); - - return GMP_LIMB_BITS - count; -} - -mp_size_t -mpn_hgcd_init_itch (mp_size_t size) -{ - /* r0 <= a, r1, r2, r3 <= b, but for simplicity, we allocate asize + - 1 for all of them. The size of the uv:s are limited to asize / 2, - but we allocate one extra limb. 
*/ - - return 4 * (size + 1) + 8 * ((size / 2) + 1); -} - -void -mpn_hgcd_init (struct hgcd *hgcd, - mp_size_t asize, - mp_limb_t *limbs) -{ - unsigned i; - unsigned j; - mp_size_t alloc = (asize / 2) + 1; - - hgcd->sign = 0; - - for (i = 0; i < 4; i++) - { - hgcd->row[i].rp = limbs; - hgcd->row[i].rsize = asize + 1; limbs += asize + 1; - } - - hgcd->alloc = alloc; - hgcd->size = alloc; - - for (i = 0; i < 4; i++) - for (j = 0; j < 2; j++) - { - hgcd->row[i].uvp[j] = limbs; - limbs += alloc; - } -} - -#if WANT_ASSERT -void -__gmpn_hgcd_sanity (const struct hgcd *hgcd, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - unsigned start, unsigned end) -{ - int sign; - unsigned i; - mp_size_t L = hgcd->size; - mp_ptr tp; - mp_size_t talloc; - mp_ptr t1p; - mp_ptr t2p; - const struct hgcd_row *r; - - ASSERT (asize >= bsize); - - ASSERT (L <= asize / 2); - ASSERT (L); - - ASSERT (L <= asize); - ASSERT (L <= bsize); - - /* NOTE: We really need only asize + bsize + 2*L, but since we're - * swapping the pointers around, we allocate 2*(asize + L). 
*/ - talloc = 2*(asize + L); - tp = __GMP_ALLOCATE_FUNC_LIMBS (talloc); - t1p = tp; - t2p = t1p + (asize + L); - - sign = hgcd->sign; - if (start % 2) - sign = ~sign; - for (i = start, r = &hgcd->row[start]; i < end; i++, sign = ~sign, r++) { - mp_size_t t1size = asize + L; - mp_size_t t2size = bsize + L; - - mp_size_t k; - for (k = hgcd->size; k < hgcd->alloc; k++) - { - ASSERT (r->uvp[0][k] == 0); - ASSERT (r->uvp[1][k] == 0); - } - - mpn_mul (t1p, ap, asize, r->uvp[0], L); - mpn_mul (t2p, bp, bsize, r->uvp[1], L); - - if (sign < 0) - MPN_PTR_SWAP (t1p, t1size, t2p, t2size); + int shift; - MPN_NORMALIZE (t2p, t2size); - ASSERT (t2size <= t1size); - ASSERT_NOCARRY (mpn_sub (t1p, t1p, t1size, t2p, t2size)); - - MPN_NORMALIZE (t1p, t1size); - ASSERT (MPN_EQUAL_P (t1p, t1size, r->rp, r->rsize)); - } - __GMP_FREE_FUNC_LIMBS (tp, talloc); - for (i = start; i < end - 1; i++) - { - /* We should have strict inequality after each reduction step, - but we allow equal values for input. */ - ASSERT (MPN_LEQ_P (hgcd->row[i+1].rp, hgcd->row[i+1].rsize, - hgcd->row[i].rp, hgcd->row[i].rsize)); + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); } -} -#endif /* WANT_ASSERT */ - -/* Helper functions for hgcd */ -/* Sets (a, b, c, d) <-- (b, c, d, a) */ -#define HGCD_SWAP4_LEFT(row) \ -do { \ - struct hgcd_row __hgcd_swap4_left_tmp; \ - __hgcd_swap4_left_tmp = row[0]; \ - row[0] = row[1]; \ - row[1] = row[2]; \ - row[2] = row[3]; \ - row[3] = __hgcd_swap4_left_tmp; \ -} while (0) - -/* Sets (a, b, c, d) <-- (d, a, b, c) */ -#define HGCD_SWAP4_RIGHT(row) \ -do { \ - struct hgcd_row __hgcd_swap4_right_tmp; \ - __hgcd_swap4_right_tmp = row[3]; \ - row[3] = row[2]; \ - row[2] = row[1]; \ - row[1] = row[0]; \ - row[0] = __hgcd_swap4_right_tmp; \ -} while (0) - -/* Sets (a, b, c, d) <-- (c, d, a, b) */ 
-#define HGCD_SWAP4_2(row) \ -do { \ - struct hgcd_row __hgcd_swap4_2_tmp; \ - __hgcd_swap4_2_tmp = row[0]; \ - row[0] = row[2]; \ - row[2] = __hgcd_swap4_2_tmp; \ - __hgcd_swap4_2_tmp = row[1]; \ - row[1] = row[3]; \ - row[3] = __hgcd_swap4_2_tmp; \ -} while (0) - -/* Sets (a, b, c) <-- (b, c, a) */ -#define HGCD_SWAP3_LEFT(row) \ -do { \ - struct hgcd_row __hgcd_swap4_left_tmp; \ - __hgcd_swap4_left_tmp = row[0]; \ - row[0] = row[1]; \ - row[1] = row[2]; \ - row[2] = __hgcd_swap4_left_tmp; \ -} while (0) - -/* Computes P = R * S. No overlap allowed. - - Temporary space is needed for two numbers smaller than the - resulting matrix elements, i.e. bounded by 2*L <= N. */ -static mp_size_t -hgcd_mul (struct hgcd_row *P, mp_size_t alloc, - const struct hgcd_row *R, mp_size_t rsize, - const struct hgcd_row *S, mp_size_t ssize, - mp_ptr tp, mp_size_t talloc) -{ - unsigned i; - unsigned j; - - mp_size_t psize; - mp_limb_t h = 0; - int grow = 0; - MPN_NORMALIZE (R[1].uvp[1], rsize); - ASSERT (S[1].uvp[1][ssize - 1] != 0); - - psize = rsize + ssize; - ASSERT (psize <= talloc); - - if (rsize >= ssize) - { - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - { - /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */ - mp_limb_t cy; - - mpn_mul (P[i].uvp[j], R[i].uvp[0], rsize, S[0].uvp[j], ssize); - mpn_mul (tp, R[i].uvp[1], rsize, S[1].uvp[j], ssize); - - cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize); - - if (cy) - { - ASSERT (psize + 1 < alloc); - P[i].uvp[j][psize] = cy; - grow = 1; - } - else - h |= P[i].uvp[j][psize - 1]; - } - } - else + /* Try an mpn_hgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M1)) { - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - { - /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */ - mp_limb_t cy; - - mpn_mul (P[i].uvp[j], S[0].uvp[j], ssize, R[i].uvp[0], rsize); - mpn_mul (tp, S[1].uvp[j], ssize, R[i].uvp[1], rsize); - - cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize); - - if (cy) - { - ASSERT (psize + 1 < alloc); - 
P[i].uvp[j][psize] = cy; - grow = 1; - } - else - h |= P[i].uvp[j][psize - 1]; - } - } - - if (grow) - return psize + 1; - else - return psize - (h == 0); -} - -/* Computes R = W^k s->r + s->u A' - s->v B', which must be - non-negative. W denotes 2^(GMP_NUMB_BITS). Temporary space needed - is k + uvsize <= M + L = N. - - Must have v > 0, v >= u. */ - -mp_size_t -mpn_hgcd_fix (mp_size_t k, - mp_ptr rp, mp_size_t ralloc, - int sign, mp_size_t uvsize, - const struct hgcd_row *s, - mp_srcptr ap, - mp_srcptr bp, - mp_ptr tp, mp_size_t talloc) -{ - mp_size_t tsize; - mp_limb_t cy; - mp_size_t rsize; - mp_srcptr up; - mp_srcptr vp; + /* Multiply M <- M * M1 */ + hgcd_matrix_mul_1 (M, &M1, tp); - up = s->uvp[0]; vp = s->uvp[1]; - MPN_NORMALIZE (vp, uvsize); - ASSERT (uvsize > 0); - - if (sign < 0) - { - MP_SRCPTR_SWAP (up, vp); - MP_SRCPTR_SWAP (ap, bp); + /* Multiply M1^{-1} (a;b) */ + return mpn_hgcd_mul_matrix1_inverse_vector (&M1, n, ap, bp, tp); } - tsize = k + uvsize; + subtract: + /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and + bh was too small, or ah, bh were (almost) equal. Perform one + subtraction step (for possible cancellation of high limbs), + followed by one division. */ - ASSERT (k + s->rsize <= ralloc); - ASSERT (tsize <= talloc); - ASSERT (tsize <= ralloc); + /* Since we must ensure that #(a-b) > s, we handle cancellation of + high limbs explicitly up front. (FIXME: Or is it better to just + subtract, normalize, and use an addition to undo if it turns out + the the difference is too small?) */ + for (an = n; an > s; an--) + if (ap[an-1] != bp[an-1]) + break; - ASSERT (rp != s->rp); - - /* r = W^k s + u a */ - if (uvsize <= k) - mpn_mul (rp, ap, k, up, uvsize); - else - mpn_mul (rp, up, uvsize, ap, k); + if (an == s) + return 0; - if (uvsize <= s->rsize) + /* Maintain a > b. When needed, swap a and b, and let col keep track + of how to update M. 
*/ + if (ap[an-1] > bp[an-1]) { - cy = mpn_add (rp + k, s->rp, s->rsize, rp + k, uvsize); - rsize = k + s->rsize; + /* a is largest. In the subtraction step, we need to update + column 1 of M */ + col = 1; } else { - cy = mpn_add (rp + k, rp + k, uvsize, s->rp, s->rsize); - rsize = k + uvsize; - } - - if (cy) - { - ASSERT (rsize < ralloc); - rp[rsize++] = cy; + MP_PTR_SWAP (ap, bp); + col = 0; } - /* r -= v b */ - - if (uvsize <= k) - mpn_mul (tp, bp, k, vp, uvsize); - else - mpn_mul (tp, vp, uvsize, bp, k); - - ASSERT_NOCARRY (mpn_sub (rp, rp, rsize, tp, tsize)); - MPN_NORMALIZE (rp, rsize); - - return rsize; -} + bn = n; + MPN_NORMALIZE (bp, bn); + if (bn <= s) + return 0; + + /* We have #a, #b > s. When is it possible that #(a-b) < s? For + cancellation to happen, the numbers must be of the form -/* Compute r2 = r0 - q r1 */ -static void -hgcd_update_r (struct hgcd_row *r, mp_srcptr qp, mp_size_t qsize) -{ - mp_srcptr r0p = r[0].rp; - mp_srcptr r1p = r[1].rp; - mp_ptr r2p = r[2].rp; - mp_size_t r0size = r[0].rsize; - mp_size_t r1size = r[1].rsize; + a = x + 1, 0, ..., 0, al + b = x , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl - ASSERT (MPN_LESS_P (r1p, r1size, r0p, r0size)); + where al, bl denotes the least significant k limbs. If al < bl, + then #(a-b) < k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX, + then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */ - if (qsize == 0) - { - ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r1p, r1size)); - } - else if (qsize == 1) + if (ap[an-1] == bp[an-1] + 1) { - mp_size_t size; - mp_limb_t cy = mpn_mul_1 (r2p, r1p, r1size, qp[0]); - size = r1size; + mp_size_t k; + int c; + for (k = an-1; k > s; k--) + if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX) + break; - if (cy) + MPN_CMP (c, ap, bp, k); + if (c < 0) { - ASSERT (size < r0size); - r2p[size++] = cy; + mp_limb_t cy; + + /* The limbs from k and up are cancelled. 
*/ + if (k == s) + return 0; + cy = mpn_sub_n (ap, ap, bp, k); + ASSERT (cy == 1); + an = k; } - - ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size)); - } - else - { - mp_size_t size = r1size + qsize; - ASSERT (size <= r0size + 1); - - if (qsize <= r1size) - mpn_mul (r2p, r1p, r1size, qp, qsize); else - mpn_mul (r2p, qp, qsize, r1p, r1size); - - if (size > r0size) { - ASSERT (size == r0size + 1); - size--; - ASSERT (r2p[size] == 0); + ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k)); + ap[k] = 1; + an = k + 1; } - - ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size)); } + else + ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an)); + + ASSERT (an > s); + ASSERT (ap[an-1] > 0); + ASSERT (bn > s); + ASSERT (bp[bn-1] > 0); + + hgcd_matrix_update_1 (M, col); - MPN_NORMALIZE (r[2].rp, r0size); - r[2].rsize = r0size; - - ASSERT (MPN_LESS_P (r2p, r0size, r1p, r1size)); -} - -/* Compute (u2, v2) = (u0, v0) + q (u1, v1) - Return the size of the largest u,v element. - Caller must ensure that usize + qsize <= available storage */ -static mp_size_t -hgcd_update_uv (struct hgcd_row *r, mp_size_t usize, - mp_srcptr qp, mp_size_t qsize) -{ - unsigned i; - mp_size_t grow; - - ASSERT (r[1].uvp[1][usize - 1] != 0); - - /* Compute u2 = u0 + q u1 */ - - if (qsize == 0) + if (an < bn) { - /* Represents a unit quotient */ - mp_limb_t cy; - - cy = mpn_add_n (r[2].uvp[0], r[0].uvp[0], r[1].uvp[0], usize); - r[2].uvp[0][usize] = cy; - - cy = mpn_add_n (r[2].uvp[1], r[0].uvp[1], r[1].uvp[1], usize); - r[2].uvp[1][usize] = cy; - grow = cy; + MPN_PTR_SWAP (ap, an, bp, bn); + col ^= 1; } - else if (qsize == 1) + else if (an == bn) { - mp_limb_t q = qp[0]; - for (i = 0; i < 2; i++) + int c; + MPN_CMP (c, ap, bp, an); + if (c < 0) { - mp_srcptr u0p = r[0].uvp[i]; - mp_srcptr u1p = r[1].uvp[i]; - mp_ptr u2p = r[2].uvp[i]; - mp_limb_t cy; - - /* Too bad we don't have an addmul_1 with distinct source and - destination */ - cy = mpn_mul_1 (u2p, u1p, usize, q); - cy += mpn_add_n (u2p, u2p, u0p, usize); - - 
u2p[usize] = cy; - grow = cy != 0; + MP_PTR_SWAP (ap, bp); + col ^= 1; } } - else - { - for (i = 0; i < 2; i++) - { - mp_srcptr u0p = r[0].uvp[i]; - mp_srcptr u1p = r[1].uvp[i]; - mp_ptr u2p = r[2].uvp[i]; - - if (qsize <= usize) - mpn_mul (u2p, u1p, usize, qp, qsize); - else - mpn_mul (u2p, qp, qsize, u1p, usize); - ASSERT_NOCARRY (mpn_add (u2p, u2p, usize + qsize, u0p, usize)); - grow = qsize - ((u2p[usize + qsize - 1]) == 0); - } - } + /* Divide a / b. */ + qn = an + 1 - bn; - usize += grow; + /* FIXME: We could use an approximate division, that may return a + too small quotient, and only guarantess that the size of r is + almost the size of b. FIXME: Let ap and remainder overlap. */ + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn); + qn -= (tp[qn -1] == 0); - /* The values should be allocated with one limb margin */ - ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0); - ASSERT (mpn_cmp (r[1].uvp[1], r[2].uvp[1], usize) <= 0); - ASSERT (r[2].uvp[1][usize - 1] != 0); + /* Normalize remainder */ + an = bn; + for ( ; an > s; an--) + if (ap[an-1] > 0) + break; - return usize; -} - -/* Compute r0 = r2 + q r1, and the corresponding uv */ -static void -hgcd_backup (struct hgcd_row *r, mp_size_t usize, - mp_srcptr qp, mp_size_t qsize) -{ - mp_ptr r0p = r[0].rp; - mp_srcptr r1p = r[1].rp; - mp_srcptr r2p = r[2].rp; - mp_size_t r0size; - mp_size_t r1size = r[1].rsize; - mp_size_t r2size = r[2].rsize; - - mp_ptr u0p = r[0].uvp[0]; - mp_ptr v0p = r[0].uvp[1]; - mp_srcptr u1p = r[1].uvp[0]; - mp_srcptr v1p = r[1].uvp[1]; - mp_srcptr u2p = r[2].uvp[0]; - mp_srcptr v2p = r[2].uvp[1]; - - ASSERT (MPN_LESS_P (r2p, r2size, r1p, r1size)); - - if (qsize == 0) - { - /* r0 = r2 + r1 */ - mp_limb_t cy = mpn_add (r0p, r1p, r1size, r2p, r2size); - r0size = r1size; - if (cy) - r0p[r0size++] = cy; - - /* (u0,v0) = (u2,v2) - (u1, v1) */ - - ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u1p, usize)); - ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v1p, usize)); - } - else if (qsize == 1) + if (an <= s) { - /* 
r0 = r2 + q r1 - - Just like for mpn_addmul_1, the result is the same size as r1, or - one limb larger. */ - + /* Quotient is too large */ mp_limb_t cy; - cy = mpn_mul_1 (r0p, r1p, r1size, qp[0]); - cy += mpn_add (r0p, r0p, r1size, r2p, r2size); - - r0size = r1size; - if (cy) - r0p[r0size++] = cy; - - /* (u0,v0) = (u2,v2) - q (u1, v1) */ - - ASSERT_NOCARRY (mpn_mul_1 (u0p, u1p, usize, qp[0])); - ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize)); - - ASSERT_NOCARRY (mpn_mul_1 (v0p, v1p, usize, qp[0])); - ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize)); - } - else - { - /* r0 = r2 + q r1 - - Result must be of size r1size + q1size - 1, or one limb - larger. */ - - mp_size_t size; - - r0size = r1size + qsize; - if (r1size >= qsize) - mpn_mul (r0p, r1p, r1size, qp, qsize); - else - mpn_mul (r0p, qp, qsize, r1p, r1size); - - ASSERT_NOCARRY (mpn_add (r0p, r0p, r0size, r2p, r2size)); + cy = mpn_add (ap, bp, bn, ap, an); - r0size -= (r0p[r0size-1] == 0); - - /* (u0,v0) = (u2,v2) - q (u1, v1) */ - - /* We must have - - usize >= #(q u1) >= qsize + #u1 - 1 - - which means that u1 must have at least - - usize - #u1 >= qsize - 1 - - zero limbs at the high end, and similarly for v1. 
*/ - - ASSERT (qsize <= usize); - size = usize - qsize + 1; -#if WANT_ASSERT - { - mp_size_t i; - for (i = size; i < usize; i++) - { - ASSERT (u1p[i] == 0); - ASSERT (v1p[i] == 0); - } - } -#endif - /* NOTE: Needs an extra limb for the u,v values */ - - if (qsize <= size) - { - mpn_mul (u0p, u1p, size, qp, qsize); - mpn_mul (v0p, v1p, size, qp, qsize); - } - else + if (cy > 0) { - mpn_mul (u0p, qp, qsize, u1p, size); - mpn_mul (v0p, qp, qsize, v1p, size); + ASSERT (bn < n); + ap[bn] = cy; + bp[bn] = 0; + bn++; } - /* qsize + size = usize + 1 */ - ASSERT (u0p[usize] == 0); - ASSERT (v0p[usize] == 0); - - ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize)); - ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize)); + MPN_DECR_U (tp, qn, 1); + qn -= (tp[qn-1] == 0); } - r[0].rsize = r0size; -} - -/* Called after HGCD_SWAP4_RIGHT, to adjust the size field. Large - numbers in row 0 don't count, and are overwritten. */ -static void -hgcd_normalize (struct hgcd *hgcd) -{ - mp_size_t size = hgcd->size; - - /* v3 should always be the largest element */ - while (size > 0 && hgcd->row[3].uvp[1][size - 1] == 0) - { - size--; - /* Row 0 is about to be overwritten. We must zero out unused limbs */ - hgcd->row[0].uvp[0][size] = 0; - hgcd->row[0].uvp[1][size] = 0; - - ASSERT (hgcd->row[1].uvp[0][size] == 0); - ASSERT (hgcd->row[1].uvp[1][size] == 0); - ASSERT (hgcd->row[2].uvp[0][size] == 0); - ASSERT (hgcd->row[2].uvp[1][size] == 0); - ASSERT (hgcd->row[3].uvp[0][size] == 0); - } + if (qn > 0) + hgcd_matrix_update_q (M, tp, qn, col, tp + qn); - hgcd->size = size; + return bn; } -int -mpn_hgcd2_lehmer_step (struct hgcd2 *hgcd, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - struct qstack *quotients) +/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M + with elements of size at most (n+1)/2 - 1. Returns new size of a, + b, or zero if no reduction is possible. 
*/ +mp_size_t +mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, mp_ptr tp) { - mp_limb_t ah; - mp_limb_t al; - mp_limb_t bh; - mp_limb_t bl; + mp_size_t s = n/2 + 1; + mp_size_t nn; - ASSERT (asize >= bsize); - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); + ASSERT (n > s); + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); - if (bsize < 2) + nn = hgcd_step (n, ap, bp, s, M, tp); + if (!nn) return 0; -#if 0 && WANT_TRACE - trace ("lehmer_step:\n" - " a = %Nd\n" - " b = %Nd\n", - ap, asize, bp, bsize); -#endif -#if WANT_TRACE - trace ("lehmer_step: asize = %d, bsize = %d\n", asize, bsize); -#endif - - /* The case asize == 2 is needed to take care of values that are - between one and two *full* limbs in size. */ - if (asize == 2 || (ap[asize-1] & GMP_NUMB_HIGHBIT)) - { - if (bsize < asize) - return 0; - - al = ap[asize - 2]; - ah = ap[asize - 1]; - - ASSERT (asize == bsize); - bl = bp[asize - 2]; - bh = bp[asize - 1]; - } - else - { - unsigned shift; - if (bsize + 1 < asize) - return 0; - - /* We want two *full* limbs */ - ASSERT (asize > 2); - - count_leading_zeros (shift, ap[asize-1]); -#if 0 && WANT_TRACE - trace ("shift = %d\n", shift); -#endif - if (bsize == asize) - bh = MPN_EXTRACT_LIMB (shift, bp[asize - 1], bp[asize - 2]); - else - { - ASSERT (asize == bsize + 1); - bh = bp[asize - 2] >> (GMP_LIMB_BITS - shift); - } - - bl = MPN_EXTRACT_LIMB (shift, bp[asize - 2], bp[asize - 3]); - - al = MPN_EXTRACT_LIMB (shift, ap[asize - 2], ap[asize - 3]); - ah = MPN_EXTRACT_LIMB (shift, ap[asize - 1], ap[asize - 2]); - } - -#if WANT_TRACE - trace ("lehmer_step: ah = %lx, al = %lx, bh = %lx, bl = %lx\n", - (unsigned long) ah, (unsigned long) al, - (unsigned long) bh, (unsigned long) bl); -#endif - return mpn_hgcd2 (hgcd, ah, al, bh, bl, quotients); -} - -/* Called when r2 has been computed, and it is too small. Top element - on the stack is r0/r1. One backup step is needed. 
*/ -static int -hgcd_small_1 (struct hgcd *hgcd, mp_size_t M, - struct qstack *quotients) -{ - mp_srcptr qp; - mp_size_t qsize; - - if (hgcd_start_row_p (hgcd->row, hgcd->size)) + for (;;) { - qstack_drop (quotients); - return 0; + n = nn; + ASSERT (n > s); + nn = hgcd_step (n, ap, bp, s, M, tp); + if (!nn ) + return n; } - - HGCD_SWAP4_RIGHT (hgcd->row); - hgcd_normalize (hgcd); - - qsize = qstack_get_1 (quotients, &qp); - - hgcd_backup (hgcd->row, hgcd->size, qp, qsize); - hgcd->sign = ~hgcd->sign; - -#if WANT_ASSERT - qstack_rotate (quotients, 0); -#endif - - return hgcd_jebelean (hgcd, M); } -/* Called when r3 has been computed, and is small enough. Two backup - steps are needed. */ -static int -hgcd_small_2 (struct hgcd *hgcd, mp_size_t M, - const struct qstack *quotients) +/* Multiply M by M1 from the right. Needs 4*(M->n + M1->n) + 5 limbs + of temporary storage (see mpn_matrix22_mul_itch). */ +void +mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1, + mp_ptr tp) { - mp_srcptr qp; - mp_size_t qsize; + mp_size_t n; - if (hgcd_start_row_p (hgcd->row + 2, hgcd->size)) - return 0; + /* About the new size of M:s elements. Since M1's diagonal elements + are > 0, no element can decrease. The new elements are of size + M->n + M1->n, one limb more or less. The computation of the + matrix product produces elements of size M->n + M1->n + 1. But + the true size, after normalization, may be two limbs smaller. */ - qsize = qstack_get_0 (quotients, &qp); - hgcd_backup (hgcd->row+1, hgcd->size, qp, qsize); + /* FIXME: Strassen multiplication gives only a small speedup. In FFT + multiplication range, this function could be sped up quite a lot + using invariance. 
*/ + ASSERT (M->n + M1->n < M->alloc); - if (hgcd_start_row_p (hgcd->row + 1, hgcd->size)) - return 0; + ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1] + | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0); - qsize = qstack_get_1 (quotients, &qp); - hgcd_backup (hgcd->row, hgcd->size, qp, qsize); + ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1] + | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0); - return hgcd_jebelean (hgcd, M); -} - -static void -hgcd_start (struct hgcd *hgcd, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize) -{ - MPN_COPY (hgcd->row[0].rp, ap, asize); - hgcd->row[0].rsize = asize; + mpn_matrix22_mul (M->p[0][0], M->p[0][1], + M->p[1][0], M->p[1][1], M->n, + M1->p[0][0], M1->p[0][1], + M1->p[1][0], M1->p[1][1], M1->n, tp); - MPN_COPY (hgcd->row[1].rp, bp, bsize); - hgcd->row[1].rsize = bsize; + n = M->n + M1->n + 1; + n -= ((M->p[0][0][n-1] | M->p[0][1][n-1] + | M->p[1][0][n-1] | M->p[1][1][n-1]) == 0); + n -= ((M->p[0][0][n-1] | M->p[0][1][n-1] + | M->p[1][0][n-1] | M->p[1][1][n-1]) == 0); - hgcd->sign = 0; - if (hgcd->size != 0) - { - /* We must zero out the uv array */ - unsigned i; - unsigned j; + ASSERT ((M->p[0][0][n-1] | M->p[0][1][n-1] + | M->p[1][0][n-1] | M->p[1][1][n-1]) > 0); - for (i = 0; i < 4; i++) - for (j = 0; j < 2; j++) - MPN_ZERO (hgcd->row[i].uvp[j], hgcd->size); - } -#if WANT_ASSERT - { - unsigned i; - unsigned j; - mp_size_t k; - - for (i = 0; i < 4; i++) - for (j = 0; j < 2; j++) - for (k = hgcd->size; k < hgcd->alloc; k++) - ASSERT (hgcd->row[i].uvp[j][k] == 0); - } -#endif - - hgcd->size = 1; - hgcd->row[0].uvp[0][0] = 1; - hgcd->row[1].uvp[1][0] = 1; + M->n = n; } -/* Performs one euclid step on r0, r1. Returns >= 0 if hgcd should be - terminated, -1 if we should go on */ -static int -euclid_step (struct hgcd *hgcd, mp_size_t M, - struct qstack *quotients) +/* Multiplies the least significant p limbs of (a;b) by M^-1. 
+ Temporary space needed: 2 * (p + M->n)*/ +mp_size_t +mpn_hgcd_matrix_adjust (struct hgcd_matrix *M, + mp_size_t n, mp_ptr ap, mp_ptr bp, + mp_size_t p, mp_ptr tp) { - mp_size_t asize; + /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b) + = (r11 a - r01 b; - r10 a + r00 b */ - mp_size_t qsize; - mp_size_t rsize; - mp_ptr qp; - mp_ptr rp; + mp_ptr t0 = tp; + mp_ptr t1 = tp + p + M->n; + mp_limb_t ah, bh; + mp_limb_t cy; - asize = hgcd->row[0].rsize; - rsize = hgcd->row[1].rsize; - qsize = asize - rsize + 1; + ASSERT (p + M->n < n); - /* Make sure we have space on stack */ - ASSERT_QSTACK (quotients); + /* First compute the two values depending on a, before overwriting a */ - if (qsize > quotients->limb_alloc - quotients->limb_next) - { - qstack_rotate (quotients, - qsize - (quotients->limb_alloc - quotients->limb_next)); - ASSERT (quotients->size_next < QSTACK_MAX_QUOTIENTS); - } - else if (quotients->size_next >= QSTACK_MAX_QUOTIENTS) + if (M->n >= p) { - qstack_rotate (quotients, 0); + mpn_mul (t0, M->p[1][1], M->n, ap, p); + mpn_mul (t1, M->p[1][0], M->n, ap, p); } - - ASSERT (qsize <= quotients->limb_alloc - quotients->limb_next); - - qp = quotients->limb + quotients->limb_next; - - rp = hgcd->row[2].rp; - mpn_tdiv_qr (qp, rp, 0, hgcd->row[0].rp, asize, hgcd->row[1].rp, rsize); - MPN_NORMALIZE (rp, rsize); - hgcd->row[2].rsize = rsize; - - if (qp[qsize - 1] == 0) - qsize--; - - if (qsize == 1 && qp[0] == 1) - qsize = 0; - - quotients->size[quotients->size_next++] = qsize; - quotients->limb_next += qsize; - - ASSERT_QSTACK (quotients); - - /* Update u and v */ - ASSERT (hgcd->size + qsize <= hgcd->alloc); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - if (hgcd->row[2].rsize <= M) - return hgcd_small_1 (hgcd, M, quotients); else { - /* Keep this remainder */ - hgcd->sign = ~hgcd->sign; - - HGCD_SWAP4_LEFT (hgcd->row); - return -1; + mpn_mul (t0, ap, p, M->p[1][1], M->n); + mpn_mul (t1, ap, p, M->p[1][0], 
M->n); } -} -/* Called when values have been computed in r[0] and r[1], and the - latter value is too large, and we know that it's not much too - large. Returns the updated size for the uv matrix. */ -static mp_size_t -hgcd_adjust (struct hgcd_row *r, mp_size_t size, - struct qstack *quotients) -{ - mp_limb_t c0; - mp_limb_t c1; - mp_limb_t d; - - /* Compute the correct r1. We have r1' = r1 - d r0, and we always - have d = 1 or 2. */ + /* Update a */ + MPN_COPY (ap, t0, p); + ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n); - ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); + if (M->n >= p) + mpn_mul (t0, M->p[0][1], M->n, bp, p); + else + mpn_mul (t0, bp, p, M->p[0][1], M->n); - MPN_NORMALIZE (r[1].rp, r[1].rsize); + cy = mpn_sub (ap, ap, n, t0, p + M->n); + ASSERT (cy <= ah); + ah -= cy; - if (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)) - { - c0 = mpn_add_n (r[1].uvp[0], r[1].uvp[0], r[0].uvp[0], size); - c1 = mpn_add_n (r[1].uvp[1], r[1].uvp[1], r[0].uvp[1], size); - d = 1; - } + /* Update b */ + if (M->n >= p) + mpn_mul (t0, M->p[0][0], M->n, bp, p); else - { - ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); - MPN_NORMALIZE (r[1].rp, r[1].rsize); - ASSERT (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); + mpn_mul (t0, bp, p, M->p[0][0], M->n); - c0 = mpn_addmul_1 (r[1].uvp[0], r[0].uvp[0], size, 2); - c1 = mpn_addmul_1 (r[1].uvp[1], r[0].uvp[1], size, 2); - d = 2; - } + MPN_COPY (bp, t0, p); + bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n); + cy = mpn_sub (bp, bp, n, t1, p + M->n); + ASSERT (cy <= bh); + bh -= cy; - /* FIXME: Can avoid branches */ - if (c1 != 0) + if (ah > 0 || bh > 0) { - r[1].uvp[0][size] = c0; - r[1].uvp[1][size] = c1; - size++; + ap[n] = ah; + bp[n] = bh; + n++; } else { - ASSERT (c0 == 0); + /* The subtraction can reduce the size by at most one limb. 
*/ + if (ap[n-1] == 0 && bp[n-1] == 0) + n--; } - - /* Remains to adjust the quotient on stack */ - qstack_adjust (quotients, d); - - return size; + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); + return n; } -/* Reduce using Lehmer steps. Called by mpn_hgcd when r1 has been - reduced to approximately the right size. Also used by - mpn_hgcd_lehmer. */ -static int -hgcd_final (struct hgcd *hgcd, mp_size_t M, - struct qstack *quotients) -{ - ASSERT (hgcd->row[0].rsize > M); - ASSERT (hgcd->row[1].rsize > M); - - /* Can be equal when called by hgcd_lehmer. */ - ASSERT (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[0].rp, hgcd->row[0].rsize)); - - for (;;) - { - mp_size_t L = hgcd->row[0].rsize; - - struct hgcd2 R; - int res; - - if (L <= M + 2 - && (L < M + 2 || (hgcd->row[0].rp[M+1] & GMP_NUMB_HIGHBIT) == 0)) - break; - - res = mpn_hgcd2_lehmer_step (&R, - hgcd->row[0].rp, hgcd->row[0].rsize, - hgcd->row[1].rp, hgcd->row[1].rsize, - quotients); - - if (res == 0) - { - /* We must divide to make progress */ - res = euclid_step (hgcd, M, quotients); - - if (res >= 0) - return res; - } - else if (res == 1) - { - mp_size_t qsize; - - /* The quotient that has been computed for r2 is at most 2 - off. So adjust that, and avoid a full division. */ - qstack_drop (quotients); - - /* Top two rows of R must be the identity matrix, followed - by a row (1, q). 
*/ - ASSERT (R.row[0].u == 1 && R.row[0].v == 0); - ASSERT (R.row[1].u == 0 && R.row[1].v == 1); - ASSERT (R.row[2].u == 1); - - qsize = (R.row[2].v != 0); - - hgcd_update_r (hgcd->row, &R.row[2].v, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - &R.row[2].v, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)) - hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); - - ASSERT (hgcd->size < hgcd->alloc); - - hgcd->sign = ~hgcd->sign; - HGCD_SWAP4_LEFT (hgcd->row); - } - else - { - const struct hgcd2_row *s = R.row + (res - 2); - int sign = R.sign; - /* Max size after reduction, plus one */ - mp_size_t ralloc = hgcd->row[1].rsize + 1; - - if (res == 2) - { - qstack_drop (quotients); - qstack_drop (quotients); - } - else if (res == 3) - { - sign = ~sign; - qstack_drop (quotients); - } - - /* s[0] and s[1] correct. */ - hgcd->row[2].rsize - = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc, - sign, - s[0].u, hgcd->row[0].rp, hgcd->row[0].rsize, - s[0].v, hgcd->row[1].rp, hgcd->row[1].rsize); - - hgcd->row[3].rsize - = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc, - ~sign, - s[1].u, hgcd->row[0].rp, hgcd->row[0].rsize, - s[1].v, hgcd->row[1].rp, hgcd->row[1].rsize); - - hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc, - s, hgcd->row, hgcd->size); - hgcd->sign ^= sign; - - ASSERT (hgcd->row[2].rsize > M); - -#if WANT_ASSERT - switch (res) - { - default: - ASSERT_ALWAYS (0 == "Unexpected value of res"); - break; - case 2: - ASSERT (hgcd->row[2].rsize >= L - 1); - ASSERT (hgcd->row[3].rsize >= L - 2); - ASSERT (hgcd->row[2].rsize > M + 1); - ASSERT (hgcd->row[3].rsize > M); - break; - case 3: - ASSERT (hgcd->row[2].rsize >= L - 2); - ASSERT (hgcd->row[3].rsize >= L - 2); - ASSERT (hgcd->row[3].rsize > M); - break; - case 4: - ASSERT (hgcd->row[2].rsize >= L - 2); - ASSERT (hgcd->row[3].rsize < L || hgcd->row[3].rp[L-1] == 1); - break; - } -#endif - if (hgcd->row[3].rsize 
<= M) - { - /* Can happen only in the res == 4 case */ - ASSERT (res == 4); - - /* Backup two steps */ - ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); - - return hgcd_small_2 (hgcd, M, quotients); - } - - HGCD_SWAP4_2 (hgcd->row); - } - } - - ASSERT (hgcd->row[1].rsize > M); - - for (;;) - { -#if WANT_ASSERT - mp_size_t L = hgcd->row[0].rsize; -#endif - mp_size_t ralloc; - - mp_size_t qsize; - mp_srcptr qp; - - struct hgcd2 R; - int res; - - /* We don't want hgcd2 to pickup any bits below r0p[M-1], so - don't tell mpn_hgcd2_lehmer_step about them. */ - res = mpn_hgcd2_lehmer_step (&R, - hgcd->row[0].rp+M-1, hgcd->row[0].rsize-M+1, - hgcd->row[1].rp+M-1, hgcd->row[1].rsize-M+1, - quotients); - if (res == 0) - { - /* We must divide to make progress */ - res = euclid_step (hgcd, M, quotients); - - if (res >= 0) - return res; - - continue; - } - - if (res == 1) - { - mp_size_t qsize; - - /* The quotient that has been computed for r2 is at most 2 - off. So adjust that, and avoid a full division. */ - qstack_drop (quotients); - - /* Top two rows of R must be the identity matrix, followed - by a row (1, q). */ - ASSERT (R.row[0].u == 1 && R.row[0].v == 0); - ASSERT (R.row[1].u == 0 && R.row[1].v == 1); - ASSERT (R.row[2].u == 1); - - qsize = (R.row[2].v != 0); +/* Size analysis for hgcd: - hgcd_update_r (hgcd->row, &R.row[2].v, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - &R.row[2].v, qsize); - ASSERT (hgcd->size < hgcd->alloc); + For the recursive calls, we have n1 <= ceil(n / 2). Then the + storage need is determined by the storage for the recursive call + computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1 + (after this, the storage needed for M1 can be recycled). - if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)) - hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); + Let S(r) denote the required storage. 
For M1 we need 4 * (ceil(n1/2) + 1) + = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2, + and for the hgcd_matrix_mul, we may need 4 ceil(n/2) + 1. In total, + 4 * ceil(n/4) + 4 ceil(n/2) + 5 <= 12 ceil(n/4) + 5. - ASSERT (hgcd->size < hgcd->alloc); + For the recursive call, we need S(n1) = S(ceil(n/2)). - hgcd->sign = ~hgcd->sign; - HGCD_SWAP4_LEFT (hgcd->row); - - continue; - } - - /* Now r0 and r1 are always correct. */ - /* Store new values in rows 2 and 3, to avoid overlap */ - - /* Max size after reduction, plus one */ - ralloc = hgcd->row[1].rsize + 1; - - hgcd->row[2].rsize - = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc, - R.sign, - R.row[0].u, hgcd->row[0].rp, hgcd->row[0].rsize, - R.row[0].v, hgcd->row[1].rp, hgcd->row[1].rsize); - - hgcd->row[3].rsize - = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc, - ~R.sign, - R.row[1].u, hgcd->row[0].rp, hgcd->row[0].rsize, - R.row[1].v, hgcd->row[1].rp, hgcd->row[1].rsize); - - ASSERT (hgcd->row[2].rsize >= L - 1); - ASSERT (hgcd->row[3].rsize >= L - 2); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize > M-1); - - hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc, - R.row, hgcd->row, hgcd->size); - hgcd->sign ^= R.sign; - - if (hgcd->row[3].rsize <= M) - { - /* Backup two steps */ - - /* We don't use R.row[2] and R.row[3], so drop the - corresponding quotients. 
*/ - qstack_drop (quotients); - qstack_drop (quotients); - - return hgcd_small_2 (hgcd, M, quotients); - } - - HGCD_SWAP4_2 (hgcd->row); - - if (res == 2) - { - qstack_drop (quotients); - qstack_drop (quotients); - - continue; - } - - /* We already know the correct q for computing r2 */ - - qsize = qstack_get_1 (quotients, &qp); - ASSERT (qsize < 2); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - ASSERT (hgcd->row[2].rsize >= M - 2); - - if (hgcd->row[2].rsize <= M) - { - /* Discard r3 */ - qstack_drop (quotients); - return hgcd_small_1 (hgcd, M, quotients); - } - if (res == 3) - { - /* Drop quotient for r3 */ - qstack_drop (quotients); - - hgcd->sign = ~hgcd->sign; - HGCD_SWAP4_LEFT (hgcd->row); - - continue; - } - - ASSERT (res == 4); - ASSERT (hgcd->row[2].rsize > M); - - /* We already know the correct q for computing r3 */ - qsize = qstack_get_0 (quotients, &qp); - ASSERT (qsize < 2); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row + 1, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - ASSERT (hgcd->row[3].rsize <= M + 1); - /* Appearantly not true. Probably because we have leading zeros - when we call hgcd2. */ - /* ASSERT (hgcd->row[3].rsize <= M || hgcd->row[3].rp[M] == 1); */ - - if (hgcd->row[3].rsize <= M) - return hgcd_jebelean (hgcd, M); - - HGCD_SWAP4_2 (hgcd->row); - } -} + S(n) <= 12*ceil(n/4) + 5 + S(ceil(n/2)) + <= 12*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 5k + S(ceil(n/2^k)) + <= 12*(2 ceil(n/4) + k) + 5k + S(n/2^k) + <= 24 ceil(n/4) + 17k + S(n/2^k) + +*/ mp_size_t -mpn_hgcd_itch (mp_size_t asize) +mpn_hgcd_itch (mp_size_t n) { - /* Scratch space is needed for calling hgcd. We need space for the - results of all recursive calls. 
In addition, we need space for - calling hgcd_fix and hgcd_mul, for which N = asize limbs should - be enough. */ + unsigned k; + int count; + mp_size_t nscaled; - /* Limit on the recursion depth */ - unsigned k = mpn_hgcd_max_recursion (asize); + if (BELOW_THRESHOLD (n, HGCD_THRESHOLD)) + return MPN_HGCD_LEHMER_ITCH (n); - return asize + mpn_hgcd_init_itch (asize + 6 * k) + 12 * k; -} + /* Get the recursion depth. */ + nscaled = (n - 1) / (HGCD_THRESHOLD - 1); + count_leading_zeros (count, nscaled); + k = GMP_LIMB_BITS - count; -/* Repeatedly divides A by B, until the remainder fits in M = - ceil(asize / 2) limbs. Stores cofactors in HGCD, and pushes the - quotients on STACK. On success, HGCD->row[0, 1, 2] correspond to - remainders that are larger than M limbs, while HGCD->row[3] - correspond to a remainder that fit in M limbs. - - Return 0 on failure (if B or A mod B fits in M limbs), otherwise - return one of 1 - 4 as specified for hgcd_jebelean. */ -int -mpn_hgcd (struct hgcd *hgcd, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - struct qstack *quotients, - mp_ptr tp, mp_size_t talloc) -{ - mp_size_t N = asize; - mp_size_t M = (N + 1)/2; - mp_size_t n; - mp_size_t m; - - struct hgcd R; - mp_size_t itch; + return 24 * ((n+3) / 4) + 17 * k + + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD); +} - ASSERT (M); -#if WANT_TRACE - trace ("hgcd: asize = %d, bsize = %d, HGCD_SCHOENHAGE_THRESHOLD = %d\n", - asize, bsize, HGCD_SCHOENHAGE_THRESHOLD); - if (asize < 100) - trace (" a = %Nd\n" - " b = %Nd\n", ap, asize, bp, bsize); -#endif +/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M + with elements of size at most (n+1)/2 - 1. Returns new size of a, + b, or zero if no reduction is possible. 
*/ - if (bsize <= M) +mp_size_t +mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, mp_ptr tp) +{ + mp_size_t s = n/2 + 1; + mp_size_t n2 = (3*n)/4 + 1; + + mp_size_t p, nn; + int success = 0; + + if (n <= s) + /* Happens when n <= 2, a fairly uninteresting case but exercised + by the random inputs of the testsuite. */ return 0; - ASSERT (asize >= 2); - - /* Initialize, we keep r0 and r1 as the reduced numbers (so far). */ - hgcd_start (hgcd, ap, asize, bp, bsize); - - if (BELOW_THRESHOLD (N, HGCD_SCHOENHAGE_THRESHOLD)) - return hgcd_final (hgcd, M, quotients); + ASSERT ((ap[n-1] | bp[n-1]) > 0); - /* Reduce the size to M + m + 1. Usually, only one hgcd call is - needed, but we may need multiple calls. When finished, the values - are stored in r0 (potentially large) and r1 (smaller size) */ + ASSERT ((n+1)/2 - 1 < M->alloc); - n = N - M; - m = (n + 1)/2; + if (BELOW_THRESHOLD (n, HGCD_THRESHOLD)) + return mpn_hgcd_lehmer (ap, bp, n, M, tp); - /* The second recursive call can use numbers of size up to n+3 */ - itch = mpn_hgcd_init_itch (n+3); - - ASSERT (itch <= talloc); - mpn_hgcd_init (&R, n+3, tp); - tp += itch; talloc -= itch; - - while (hgcd->row[1].rsize > M + m + 1) + p = n/2; + nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp); + if (nn > 0) { - /* Max size after reduction, plus one */ - mp_size_t ralloc = hgcd->row[1].rsize + 1; - - int res = mpn_hgcd (&R, - hgcd->row[0].rp + M, hgcd->row[0].rsize - M, - hgcd->row[1].rp + M, hgcd->row[1].rsize - M, - quotients, tp, talloc); - - if (res == 0) - { - /* We must divide to make progress */ - res = euclid_step (hgcd, M, quotients); - - if (res > 0) - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); - if (res >= 0) - return res; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); - } - else if (res <= 2) - { - /* The reason we use hgcd_adjust also when res == 2 is that - either r2 is correct, and we get it for free. - - Or r2 is too large. 
Then can correct it by a few bignum - subtractions, and we are *guaranteed* that the result is - small enough that we don't need another run through this - loop. */ - - /* FIXME: For res == 1, the newly computed row[2] will be - the same as the old row[1], so we do some unnecessary - computations. */ - - qstack_drop (quotients); - - /* Store new values in rows 2 and 3, to avoid overlap */ - hgcd->row[2].rsize - = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, - ~R.sign, R.size, &R.row[1], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - hgcd->row[3].rsize - = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, - R.sign, R.size, &R.row[2], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize > M); - - /* Computes the uv matrix for the (possibly incorrect) - values r1, r2. The elements must be smaller than the - correct ones, since they correspond to a too small q. */ - - hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc, - R.row + 1, R.size, - hgcd->row, hgcd->size, - tp, talloc); - hgcd->sign ^= ~R.sign; - - if (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)) - { - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - HGCD_SWAP4_2 (hgcd->row); - } - else - { - /* r2 was too large, i.e. q0 too small. In this case we - must have r2 % r1 <= r2 - r1 smaller than M + m + 1. */ - - hgcd->size = hgcd_adjust (hgcd->row + 2, hgcd->size, quotients); - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - ASSERT (hgcd->row[3].rsize <= M + m + 1); - - if (hgcd->row[3].rsize <= M) - { - /* Backup two steps */ - ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); - - return hgcd_small_2 (hgcd, M, quotients); - } - - HGCD_SWAP4_2 (hgcd->row); - - /* Loop always terminates here. 
*/ - break; - } - } - else if (res == 3) - { - qstack_drop(quotients); - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); - - /* Store new values in rows 2 and 3, to avoid overlap */ - hgcd->row[2].rsize - = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, - ~R.sign, R.size, &R.row[1], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - hgcd->row[3].rsize - = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, - R.sign, R.size, &R.row[2], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize > M); - - hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc, - R.row + 1, R.size, - hgcd->row, hgcd->size, - tp, talloc); - hgcd->sign ^= ~R.sign; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - HGCD_SWAP4_2 (hgcd->row); - } - else - { - ASSERT (res == 4); - - /* All of r0, r1, r3 and r3 are correct. - Compute r2 and r3 */ - - ASSERT_HGCD (&R, - hgcd->row[0].rp + M, hgcd->row[0].rsize - M, - hgcd->row[1].rp + M, hgcd->row[1].rsize - M, - 0, 4); - - /* Store new values in rows 2 and 3, to avoid overlap */ - hgcd->row[2].rsize - = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, - R.sign, R.size, &R.row[2], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - hgcd->row[3].rsize - = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, - ~R.sign, R.size, &R.row[3], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize <= M + m + 1); - - hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc, - R.row+2, R.size, - hgcd->row, hgcd->size, - tp, talloc); - hgcd->sign ^= R.sign; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - if (hgcd->row[3].rsize <= M) - { - /* Backup two steps */ - /* Both steps must always be possible, but it's not - trivial to ASSERT that here. */ - ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); - - return hgcd_small_2 (hgcd, M, quotients); - } - HGCD_SWAP4_2 (hgcd->row); - - /* Always exit the loop. 
*/ - break; - } + /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1) + = 2 (n - 1) */ + n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp); + success = 1; } - - ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize); - ASSERT (hgcd->row[1].rsize > M); - ASSERT (hgcd->row[1].rsize <= M + m + 1); - - if (hgcd->row[0].rsize > M + m + 1) + while (n > n2) { - /* One euclid step to reduce size. */ - int res = euclid_step (hgcd, M, quotients); - - if (res > 0) - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); - if (res >= 0) - return res; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); + /* Needs n + 1 storage */ + nn = hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success ? n : 0; + n = nn; + success = 1; } - ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize); - ASSERT (hgcd->row[0].rsize <= M + m + 1); - ASSERT (hgcd->row[1].rsize > M); - - /* Second phase, reduce size until we have one number of size > M - and one of size <= M+1 */ - while (hgcd->row[1].rsize > M + 1) + if (n > s + 2) { - mp_size_t k = 2*M - hgcd->row[0].rsize; -#if WANT_ASSERT - mp_size_t n1 = hgcd->row[0].rsize - k; -#endif - mp_size_t qsize; - mp_srcptr qp; - int res; - - ASSERT (k + (n1 + 1)/2 == M); - ASSERT (n1 >= 2); - - ASSERT (n1 <= 2*(m + 1)); - ASSERT (n1 <= n + 3); - - res = mpn_hgcd (&R, - hgcd->row[0].rp + k, hgcd->row[0].rsize - k, - hgcd->row[1].rp + k, hgcd->row[1].rsize - k, - quotients, tp, talloc); - - if (res == 0) - { - /* The first remainder was small. Then there's a good chance - that the remainder A % B is also small. 
*/ - res = euclid_step (hgcd, M, quotients); + struct hgcd_matrix M1; + mp_size_t scratch; - if (res > 0) - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); - if (res >= 0) - return res; + p = 2*s - n + 1; + scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p); - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); - continue; - } - - if (res == 1) + mpn_hgcd_matrix_init(&M1, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch); + if (nn > 0) { - mp_srcptr qp; - mp_size_t qsize; - - qstack_drop (quotients); - - /* Compute possibly incorrect r2 and corresponding u2, v2. - Incorrect matrix elements must be smaller than the - correct ones, since they correspond to a too small q. */ - qsize = qstack_get_0 (quotients, &qp); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - if (!MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)) - hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 3); - - if (hgcd->row[2].rsize <= M) - { - /* Backup one steps */ - ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); - - return hgcd_small_1 (hgcd, M, quotients); - } - - HGCD_SWAP4_LEFT (hgcd->row); - hgcd->sign = ~hgcd->sign; - continue; + /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */ + ASSERT (M->n + 2 >= M1.n); + + /* Furthermore, assume M ends with a quotient (1, q; 0, 1), + then either q or q + 1 is a correct quotient, and M1 will + start with either (1, 0; 1, 1) or (2, 1; 1, 1). This + rules out the case that the size of M * M1 is much + smaller than the expected M->n + M1->n. */ + + ASSERT (M->n + M1.n < M->alloc); + + /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1) + = 2*s <= 2*(floor(n/2) + 1) <= n + 2. 
*/ + n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch); + /* Needs 4 ceil(n/2) + 1 */ + mpn_hgcd_matrix_mul (M, &M1, tp + scratch); + success = 1; } - - /* Now r0 and r1 are always correct. */ - - /* It's possible that first two "new" r:s are the same as the - old ones. In that case skip recomputing them. */ - - if (!hgcd_start_row_p (&R.row[0], R.size)) - { - /* Store new values in rows 2 and 3, to avoid overlap */ - hgcd->row[2].rsize - = mpn_hgcd_fix (k, hgcd->row[2].rp, hgcd->row[0].rsize + 1, - R.sign, R.size, &R.row[0], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - hgcd->row[3].rsize - = mpn_hgcd_fix (k, hgcd->row[3].rp, hgcd->row[1].rsize + 1, - ~R.sign, R.size, &R.row[1], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize > k); - - hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc, - R.row, R.size, hgcd->row, hgcd->size, - tp, talloc); - hgcd->sign ^= R.sign; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - if (hgcd->row[3].rsize <= M) - { - /* Backup two steps */ - - /* We don't use R.row[2] and R.row[3], so drop the - corresponding quotients. 
*/ - qstack_drop (quotients); - qstack_drop (quotients); - - return hgcd_small_2 (hgcd, M, quotients); - } - - HGCD_SWAP4_2 (hgcd->row); - - if (res == 2) - { - qstack_drop (quotients); - qstack_drop (quotients); - - continue; - } - } - - ASSERT (res >= 3); - - /* We already know the correct q */ - qsize = qstack_get_1 (quotients, &qp); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - ASSERT (hgcd->row[2].rsize > k); - if (hgcd->row[2].rsize <= M) - { - /* Discard r3 */ - qstack_drop (quotients); - return hgcd_small_1 (hgcd, M, quotients); - } - if (res == 3) - { - /* Drop quotient for r3 */ - qstack_drop (quotients); - hgcd->sign = ~hgcd->sign; - HGCD_SWAP4_LEFT (hgcd->row); - - continue; - } - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (res == 4); - - /* We already know the correct q */ - qsize = qstack_get_0 (quotients, &qp); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row + 1, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - ASSERT (hgcd->row[3].rsize <= M + 1); - - if (hgcd->row[3].rsize <= M) - { -#if WANT_ASSERT - qstack_rotate (quotients, 0); -#endif - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); - return hgcd_jebelean (hgcd, M); - } - - HGCD_SWAP4_2 (hgcd->row); } - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); + /* This really is the base case */ + for (;;) + { + /* Needs s+3 < n */ + nn = hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success ? n : 0; - return hgcd_final (hgcd, M, quotients); + n = nn; + success = 1; + } } diff --git a/mpn/generic/hgcd2.c b/mpn/generic/hgcd2.c index 4ce579e8a..df6b94025 100644 --- a/mpn/generic/hgcd2.c +++ b/mpn/generic/hgcd2.c @@ -89,506 +89,201 @@ div2 (mp_ptr rp, return q; } #else /* GMP_NAIL_BITS != 0 */ -/* Two-limb division optimized for small quotients. 
Input words - include nails, which must be zero. */ -static inline mp_limb_t -div2 (mp_ptr rp, - mp_limb_t nh, mp_limb_t nl, - mp_limb_t dh, mp_limb_t dl) +/* Check all functions for nail support. */ +/* hgcd2 should be defined to take inputs including nail bits, and + produce a matrix with elements also including nail bits. This is + necessary, for the matrix elements to be useful with mpn_mul_1, + mpn_addmul_1 and friends. */ +#error Not implemented +#endif /* GMP_NAIL_BITS != 0 */ + +/* Reduces a,b until |a-b| fits in one limb + 1 bit. Constructs + matrix M. Returns 1 if we make progress, i.e. can perform at least + one subtraction. Otherwise returns zero.. */ + +/* FIXME: Possible optimizations: + + The div2 function starts with checking the most significant bit of + the numerator. We can maintained normalized operands here, call + hgcd with normalized operands only, which should make the code + simpler and possibly faster. + + Experiment with table lookups on the most significant bits. + + This function is also a candidate for assembler implementation. +*/ +int +mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, + struct hgcd_matrix1 *M) { - mp_limb_t q = 0; - int cnt; - - ASSERT_LIMB(nh); - ASSERT_LIMB(nl); - ASSERT_LIMB(dh); - ASSERT_LIMB(dl); - - /* FIXME: Always called with nh > 0 and dh >0. Then it should be - enough to look at the high limbs to select cnt. */ - for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++) - { - dh = (dh << 1) | (dl >> (GMP_NUMB_BITS - 1)); - dl = (dl << 1) & GMP_NUMB_MASK; - } - - while (cnt) + mp_limb_t u00, u01, u10, u11; + + if (ah < 2 || bh < 2) + return 0; + + if (ah > bh || (ah == bh && al > bl)) { - dl = (dh << (GMP_NUMB_BITS - 1)) | (dl >> 1); - dh = dh >> 1; - dl &= GMP_NUMB_MASK; - - q <<= 1; - if (nh > dh || (nh == dh && nl >= dl)) - { - /* FIXME: We could perhaps optimize this by unrolling the - loop 2^GMP_NUMB_BITS - 1 times? 
*/ - nl -= dl; - nh -= dh; - nh -= (nl >> (GMP_LIMB_BITS - 1)); - nl &= GMP_NUMB_MASK; - - q |= 1; - } - cnt--; + sub_ddmmss (ah, al, ah, al, bh, bl); + if (ah < 2) + return 0; + + u00 = u01 = u11 = 1; + u10 = 0; } - ASSERT (nh < dh || (nh == dh && nl < dl)); - rp[0] = nl; - rp[1] = nh; + else + { + sub_ddmmss (bh, bl, bh, bl, ah, al); + if (bh < 2) + return 0; - return q; -} -#endif /* GMP_NAIL_BITS != 0 */ + u00 = u10 = u11 = 1; + u01 = 0; + } -#define SUB_2(w1,w0, x1,x0, y1,y0) \ - do { \ - ASSERT_LIMB (x1); \ - ASSERT_LIMB (x0); \ - ASSERT_LIMB (y1); \ - ASSERT_LIMB (y0); \ - \ - if (GMP_NAIL_BITS == 0) \ - sub_ddmmss (w1,w0, x1,x0, y1,y0); \ - else \ - { \ - mp_limb_t __w0, __c; \ - SUBC_LIMB (__c, __w0, x0, y0); \ - (w1) = ((x1) - (y1) - __c) & GMP_NUMB_MASK; \ - (w0) = __w0; \ - } \ - } while (0) - -static inline void -qstack_push_0 (struct qstack *stack) -{ - ASSERT_QSTACK (stack); + if (ah < bh) + goto subtract_a; - if (stack->size_next >= QSTACK_MAX_QUOTIENTS) - qstack_rotate (stack, 0); + for (;;) + { + ASSERT (ah >= bh); + if (ah == bh) + break; - stack->size[stack->size_next++] = 0; -} + /* Subtract a -= q b, and multiply M from the right by (1 q ; 0 + 1), affecting the second column of M. */ + ASSERT (ah > bh); + sub_ddmmss (ah, al, ah, al, bh, bl); -static inline void -qstack_push_1 (struct qstack *stack, mp_limb_t q) -{ - ASSERT (q >= 2); + if (ah < 2) + break; - ASSERT_QSTACK (stack); + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, ah, al, bh, bl); + al = r[0]; ah = r[1]; + if (ah < 2) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + break; + } + q++; + u01 += q * u00; + u11 += q * u10; + } + subtract_a: + ASSERT (bh >= ah); + if (ah == bh) + break; - if (stack->limb_next >= stack->limb_alloc) - qstack_rotate (stack, 1); + /* Subtract b -= q a, and multiply M from the right by (1 0 ; q + 1), affecting the first column of M. 
*/ + sub_ddmmss (bh, bl, bh, bl, ah, al); - else if (stack->size_next >= QSTACK_MAX_QUOTIENTS) - qstack_rotate (stack, 0); + if (bh < 2) + break; - stack->size[stack->size_next++] = 1; - stack->limb[stack->limb_next++] = q; + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, bh, bl, ah, al); + bl = r[0]; bh = r[1]; + if (bh < 2) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + break; + } + q++; + u00 += q * u01; + u10 += q * u11; + } + } + M->u[0][0] = u00; M->u[0][1] = u01; + M->u[1][0] = u10; M->u[1][1] = u11; - ASSERT_QSTACK (stack); + return 1; } -/* Produce r_k from r_i and r_j, and push the corresponding - quotient. */ -#if __GMP_HAVE_TOKEN_PASTE -#define HGCD2_STEP(i, j, k) do { \ - SUB_2 (rh ## k, rl ## k, \ - rh ## i, rl ## i, \ - rh ## j, rl ## j); \ - \ - /* Could check here for the special case rh3 == 0, \ - but it's covered by the below condition as well */ \ - if ( rh ## k < rh ## j \ - || ( rh ## k == rh ## j \ - && rl ## k < rl ## j)) \ - { \ - /* Unit quotient */ \ - u ## k = u ## i + u ## j; \ - v ## k = v ## i + v ## j; \ - \ - if (quotients) \ - qstack_push_0 (quotients); \ - } \ - else \ - { \ - mp_limb_t r[2]; \ - mp_limb_t q = 1 + div2 (r, rh ## k, rl ## k, \ - rh ## j, rl ## j); \ - rl ## k = r[0]; rh ## k = r[1]; \ - u ## k = u ## i + q * u ## j; \ - v ## k = v ## i + q * v ## j; \ - \ - if (quotients) \ - qstack_push_1 (quotients, q); \ - } \ -} while (0) -#else /* ! 
__GMP_HAVE_TOKEN_PASTE */ -#define HGCD2_STEP(i, j, k) do { \ - SUB_2 (rh/**/k, rl/**/k, \ - rh/**/i, rl/**/i, \ - rh/**/j, rl/**/j); \ - \ - /* Could check here for the special case rh3 == 0, \ - but it's covered by the below condition as well */ \ - if ( rh/**/k < rh/**/j \ - || ( rh/**/k == rh/**/j \ - && rl/**/k < rl/**/j)) \ - { \ - /* Unit quotient */ \ - u/**/k = u/**/i + u/**/j; \ - v/**/k = v/**/i + v/**/j; \ - \ - if (quotients) \ - qstack_push_0 (quotients); \ - } \ - else \ - { \ - mp_limb_t r[2]; \ - mp_limb_t q = 1 + div2 (r, rh/**/k, rl/**/k, \ - rh/**/j, rl/**/j); \ - rl/**/k = r[0]; rh/**/k = r[1]; \ - u/**/k = u/**/i + q * u/**/j; \ - v/**/k = v/**/i + q * v/**/j; \ - \ - if (quotients) \ - qstack_push_1 (quotients, q); \ - } \ -} while (0) -#endif /* ! __GMP_HAVE_TOKEN_PASTE */ - -/* Repeatedly divides A by B, until the remainder is a single limb. - Stores cofactors in HGCD, and pushes the quotients on STACK (unless - STACK is NULL). On success, HGCD->row[0, 1, 2] correspond to - remainders that are larger than one limb, while HGCD->row[3] - correspond to a remainder that fit in a single limb. - - Return 0 on failure (if B or A mod B fits in a single limb). Return - 1 if r0 and r1 are correct, but we still make no progress because - r0 = A, r1 = B. - - Otherwise return 2, 3 or 4 depending on how many of the r:s that - satisfy Jebelean's criterion. */ -/* FIXME: There are two more micro optimizations that could be done to - this code: +/* Multiply (a;b) by M = (u00, u01; u10, u11). Needs n limbs of + temporary storage. Vector must have space for n + 1 limbs. */ +mp_size_t +mpn_hgcd_mul_matrix1_vector (struct hgcd_matrix1 *M, mp_size_t n, + mp_ptr ap, mp_ptr bp, mp_ptr tp) +{ + mp_limb_t ah, bh; - The div2 function starts with checking the most significant bit of - the numerator. When we call div2, that bit is know in advance for - all but the one or two first calls, so we could split div2 in two - functions, and call the right one. 
+ /* Compute (a,b) <-- (u00 a + u10 b, u01 a + u11 b) as - We could also have two versions of this code, with and without the - quotient argument, to avoid checking if it's NULL in the middle of - the loop. */ + t = a + a *= u00 + a += u10 * b + b *= u11 + b += u01 * t + */ -int -mpn_hgcd2 (struct hgcd2 *hgcd, - mp_limb_t ah, mp_limb_t al, - mp_limb_t bh, mp_limb_t bl, - struct qstack *quotients) -{ - /* For all divisions, we special case q = 1, which accounts for - approximately 41% of the quotients for random numbers (Knuth, - TAOCP 4.5.3) */ - - /* Use scalar variables */ - mp_limb_t rh1, rl1, u1, v1; - mp_limb_t rh2, rl2, u2, v2; - mp_limb_t rh3, rl3, u3, v3; - - ASSERT_LIMB(ah); - ASSERT_LIMB(al); - ASSERT_LIMB(bh); - ASSERT_LIMB(bl); - ASSERT (ah > bh || (ah == bh && al >= bl)); - - if (bh == 0) - return 0; + /* This copying could be avoided if we let our caller swap some + * pointers. */ + MPN_COPY (tp, ap, n); - { - mp_limb_t rh0, rl0, u0, v0; - - /* Initialize first two rows */ - rh0 = ah; rl0 = al; u0 = 1; v0 = 0; - rh1 = bh; rl1 = bl; u1 = 0; v1 = 1; - - SUB_2 (rh2, rl2, rh0, rl0, rh1, rl1); - - if (rh2 == 0) - return 0; - - if (rh2 < rh1 || (rh2 == rh1 && rl2 < rl1)) - { - /* Unit quotient */ - v2 = 1; - - if (quotients) - qstack_push_0 (quotients); - } - else - { - mp_limb_t r[2]; - mp_limb_t q = 1 + div2 (r, rh2, rl2, rh1, rl1); - - rl2 = r[0]; rh2 = r[1]; - - if (rh2 == 0) - return 0; - - v2 = q; - - if (quotients) - qstack_push_1 (quotients, q); - } - - u2 = 1; - - /* The simple version of the loop is as follows: - | - | hgcd->sign = 0; - | for (;;) - | { - | (q, rh3, rl3]) = divmod (r1, r2); - | u[3] = u1 + q * u2; - | v[3] = v1 + q * v2; - | qstack_push_1 (quotients, q); - | - | if (rh3 == 0) - | break; - | - | HGCD2_SHIFT4_LEFT (hgcd->row); - | hgcd->sign = ~hgcd->sign; - | } - | - | But then we special case for q = 1, and unroll the loop four times - | to avoid data movement. 
*/ - - for (;;) - { - HGCD2_STEP (1, 2, 3); - if (rh3 == 0) - { - hgcd->row[0].u = u0; hgcd->row[0].v = v0; - - hgcd->sign = 0; - - break; - } - HGCD2_STEP (2, 3, 0); - if (rh0 == 0) - { - hgcd->row[0].u = u1; hgcd->row[0].v = v1; - - rh1 = rh2; rl1 = rl2; u1 = u2; v1 = v2; - rh2 = rh3; rl2 = rl3; u2 = u3; v2 = v3; - rh3 = rh0; rl3 = rl0; u3 = u0; v3 = v0; - - hgcd->sign = -1; - break; - } - - HGCD2_STEP (3, 0, 1); - if (rh1 == 0) - { - hgcd->row[0].u = u2; hgcd->row[0].v = v2; - rh2 = rh0; rl2 = rl0; u2 = u0; v2 = v0; - - MP_LIMB_T_SWAP (rh1, rh3); MP_LIMB_T_SWAP (rl1, rl3); - MP_LIMB_T_SWAP ( u1, u3); MP_LIMB_T_SWAP ( v1, v3); - - hgcd->sign = 0; - break; - } - - HGCD2_STEP (0, 1, 2); - if (rh2 == 0) - { - hgcd->row[0].u = u3; hgcd->row[0].v = v3; - - rh3 = rh2; rl3 = rl2; u3 = u2; v3 = v2; - rh2 = rh1; rl2 = rl1; u2 = u1; v2 = v1; - rh1 = rh0; rl1 = rl0; u1 = u0; v1 = v0; - - hgcd->sign = -1; - break; - } - } - } - - ASSERT (rh1 != 0); - ASSERT (rh2 != 0); - ASSERT (rh3 == 0); - ASSERT (rh1 > rh2 || (rh1 == rh2 && rl1 > rl2)); - ASSERT (rh2 > rh3 || (rh2 == rh3 && rl2 > rl3)); - - /* Coefficients to be returned */ - hgcd->row[1].u = u1; hgcd->row[1].v = v1; - hgcd->row[2].u = u2; hgcd->row[2].v = v2; - hgcd->row[3].u = u3; hgcd->row[3].v = v3; - - /* Rows 1, 2 and 3 are used below, rh0, rl0, u0 and v0 are not. 
*/ -#if GMP_NAIL_BITS == 0 - { - mp_limb_t sh; - mp_limb_t sl; - mp_limb_t th; - mp_limb_t tl; - - /* Check r2 */ - /* We always have r2 > u2, v2 */ - - if (hgcd->sign >= 0) - { - /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */ - sl = u2 + u1; - sh = (sl < u1); - } - else - { - /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */ - sl = v2 + v1; - sh = (sl < v1); - } - - sub_ddmmss (th, tl, rh1, rl1, rh2, rl2); - - if (th < sh || (th == sh && tl < sl)) - return 2 - (hgcd->row[0].v == 0); - - /* Check r3 */ - - if (hgcd->sign >= 0) - { - /* Check r3 >= max (-u3, -v3) = |u3| */ - if (rl3 < u3) - return 3; - - /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/ - sl = v3 + v2; - sh = (sl < v2); - } - else - { - /* Check r3 >= max (-u3, -v3) = |v3| */ - if (rl3 < v3) - return 3; - - /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */ - sl = u3 + u2; - sh = (sl < u2); - } - - sub_ddmmss (th, tl, rh2, rl2, 0, rl3); - - if (th < sh || (th == sh && tl < sl)) - return 3; - - return 4; - } -#else /* GMP_NAIL_BITS > 0 */ - { - mp_limb_t sl; - mp_limb_t th; - mp_limb_t tl; - - /* Check r2 */ - /* We always have r2 > u2, v2 */ - - if (hgcd->sign >= 0) - { - /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */ - sl = u2 + u1; - } - else - { - /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */ - sl = v2 + v1; - } - - tl = rl1 - rl2; - th = rh1 - rh2 - (tl >> (GMP_LIMB_BITS - 1)); - ASSERT_LIMB(th); - - if (th < (CNST_LIMB(1) << GMP_NAIL_BITS) - && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl) - return 2 - (hgcd->row[0].v == 0); - - /* Check r3 */ - - if (hgcd->sign >= 0) - { - /* Check r3 >= max (-u3, -v3) = |u3| */ - if (rl3 < u3) - return 3; - - /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/ - sl = v3 + v2; - } - else - { - /* Check r3 >= max (-u3, -v3) = |v3| */ - if (rl3 < v3) - return 3; - - /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */ - sl = u3 + u2; - } - - tl = rl2 - rl3; - th = rh2 - (tl >> (GMP_LIMB_BITS - 1)); - ASSERT_LIMB(th); - - if (th < (CNST_LIMB(1) << GMP_NAIL_BITS) - && ((th << 
GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl) - return 3; - - return 4; - } -#endif /* GMP_NAIL_BITS > 0 */ + ah = mpn_mul_1 (ap, ap, n, M->u[0][0]); + ah += mpn_addmul_1 (ap, bp, n, M->u[1][0]); + + bh = mpn_mul_1 (bp, bp, n, M->u[1][1]); + bh += mpn_addmul_1 (bp, tp, n, M->u[0][1]); + + ap[n] = ah; + bp[n] = bh; + + n += (ap[n] | bp[n]) > 0; + return n; } +/* Multiply (a;b) by M^{-1} = (u11, -u01; -u10, u00) from the left. + Needs n limbs of temporary storage. */ mp_size_t -mpn_hgcd2_fix (mp_ptr rp, mp_size_t ralloc, - int sign, - mp_limb_t u, mp_srcptr ap, mp_size_t asize, - mp_limb_t v, mp_srcptr bp, mp_size_t bsize) +mpn_hgcd_mul_matrix1_inverse_vector (struct hgcd_matrix1 *M, mp_size_t n, + mp_ptr ap, mp_ptr bp, mp_ptr tp) { - mp_size_t rsize; - mp_limb_t cy; + mp_limb_t h0, h1; - ASSERT_LIMB(u); - ASSERT_LIMB(v); + /* Compute (a;b) <-- (u11 a - u01 b; -u10 a + u00 b) as - if (sign < 0) - { - MP_LIMB_T_SWAP (u,v); - MPN_SRCPTR_SWAP (ap, asize, bp, bsize); - } + t = a + a *= u11 + a -= u01 * b + b *= u00 + b -= u10 * t + */ - ASSERT (u > 0); + /* This copying could be avoided if we let our caller swap some + * pointers. 
*/ + MPN_COPY (tp, ap, n); - ASSERT (asize <= ralloc); - rsize = asize; - cy = mpn_mul_1 (rp, ap, asize, u); - if (cy) - { - ASSERT (rsize < ralloc); - rp[rsize++] = cy; - } + h0 = mpn_mul_1 (ap, ap, n, M->u[1][1]); + h1 = mpn_submul_1 (ap, bp, n, M->u[0][1]); + ASSERT (h0 == h1); - if (v > 0) - { - ASSERT (bsize <= rsize); - cy = mpn_submul_1 (rp, bp, bsize, v); - if (cy) - { - ASSERT (bsize < rsize); - ASSERT_NOCARRY (mpn_sub_1 (rp + bsize, - rp + bsize, rsize - bsize, cy)); - } + h0 = mpn_mul_1 (bp, bp, n, M->u[0][0]); + h1 = mpn_submul_1 (bp, tp, n, M->u[1][0]); + ASSERT (h0 == h1); - MPN_NORMALIZE (rp, rsize); - } - return rsize; + n -= (ap[n-1] | bp[n-1]) == 0; + return n; } -#undef HGCD2_STEP diff --git a/mpn/generic/matrix22_mul.c b/mpn/generic/matrix22_mul.c new file mode 100644 index 000000000..0b8b61303 --- /dev/null +++ b/mpn/generic/matrix22_mul.c @@ -0,0 +1,254 @@ +/* matrix22_mul.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#define MUL(rp, ap, an, bp, bn) do { \ + if (an >= bn) \ + mpn_mul (rp, ap, an, bp, bn); \ + else \ + mpn_mul (rp, bp, bn, ap, an); \ +} while (0) + +/* Inputs are unsigned. */ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + int c; + MPN_CMP (c, ap, bp, n); + if (c >= 0) + { + mpn_sub_n (rp, ap, bp, n); + return 0; + } + else + { + mpn_sub_n (rp, bp, ap, n); + return 1; + } +} + +static int +add_signed_n (mp_ptr rp, + mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n) +{ + if (as != bs) + return as ^ abs_sub_n (rp, ap, bp, n); + else + { + ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n)); + return as; + } +} + +mp_size_t +mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn) +{ + if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) + || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) + return 3*rn + 2*mn; + else + return 4*(rn + mn) + 5; +} + +/* Algorithm: + + / s0 \ / 1 0 0 0 \ / r0 \ + | s1 | | 0 1 0 0 | | r1 | + | s2 | | 0 0 1 1 | | r2 | + | s3 | = | -1 0 1 1 | \ r3 / + | s4 | | 1 0 -1 0 | + | s5 | | 1 1 -1 -1 | + \ s6 / \ 0 0 0 1 / + + / t0 \ / 1 0 0 0 \ / m0 \ + | t1 | | 0 0 1 0 | | m1 | + | t2 | | -1 1 0 0 | | m2 | + | t3 | = | 1 -1 0 1 | \ m3 / + | t4 | | 0 -1 0 1 | + | t5 | | 0 0 0 1 | + \ t6 / \ -1 1 1 -1 / + + / r0 \ / 1 1 0 0 0 0 0 \ / s0 * t0 \ + | r1 | = | 1 0 1 1 0 1 0 | | s1 * t1 | + | r2 | | 1 0 0 1 1 0 1 | | s2 * t2 | + \ r3 / \ 1 0 1 1 1 0 0 / | s3 * t3 | + | s4 * t4 | + | s5 * t5 | + \ s6 * t6 / +*/ + +/* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3). + * + * Resulting elements are of size up to rn + mn + 1. + * + * Temporary storage: 4 rn + 4 mn + 5. 
*/ +void +mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, + mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, + mp_ptr tp) +{ + mp_ptr s2, s3, t2, t3, u0, u1; + int r2s, r3s, s3s, t2s, t3s, u0s, u1s; + s2 = tp; tp += rn; + s3 = tp; tp += rn + 1; + t2 = tp; tp += mn; + t3 = tp; tp += mn + 1; + u0 = tp; tp += rn + mn + 1; + u1 = tp; /* rn + mn + 2 */ + + MUL (u0, r0, rn, m0, mn); /* 0 */ + MUL (u1, r1, rn, m2, mn); /* 1 */ + + MPN_COPY (s2, r3, rn); + + r3[rn] = mpn_add_n (r3, r3, r2, rn); + r0[rn] = 0; + s3s = abs_sub_n (s3, r3, r0, rn + 1); + t2s = abs_sub_n (t2, m1, m0, mn); + if (t2s) + { + t3[mn] = mpn_add_n (t3, m3, t2, mn); + t3s = 0; + } + else + { + t3s = abs_sub_n (t3, m3, t2, mn); + t3[mn] = 0; + } + + r2s = abs_sub_n (r2, r0, r2, rn); + r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn); + + MUL(u1, s3, rn+1, t3, mn+1); /* 3 */ + u1s = s3s ^ t3s; + ASSERT (u1[rn+mn+1] == 0); + ASSERT (u1[rn+mn] < 4); + + if (u1s) + { + u0[rn+mn] = 0; + u0s = abs_sub_n (u0, u0, u1, rn + mn + 1); + } + else + { + u0[rn+mn] = u1[rn+mn] + mpn_add_n (u0, u0, u1, rn + mn); + u0s = 0; + } + MUL(u1, r3, rn + 1, t2, mn); /* 2 */ + u1s = t2s; + ASSERT (u1[rn+mn] < 2); + + u1s = add_signed_n (u1, u0, u0s, u1, u1s, rn + mn + 1); + + t2s = abs_sub_n (t2, m3, m1, mn); + if (s3s) + { + s3[rn] += mpn_add_n (s3, s3, r1, rn); + s3s = 0; + } + else if (s3[rn] > 0) + { + s3[rn] -= mpn_sub_n (s3, s3, r1, rn); + s3s = 1; + } + else + { + s3s = abs_sub_n (s3, r1, s3, rn); + } + MUL (r1, s3, rn+1, m3, mn); /* 5 */ + ASSERT_NOCARRY(add_signed_n (r1, r1, s3s, u1, u1s, rn + mn + 1)); + ASSERT (r1[rn + mn] < 2); + + MUL (r3, r2, rn, t2, mn); /* 4 */ + r3s = r2s ^ t2s; + r3[rn + mn] = 0; + u0s = add_signed_n (u0, u0, u0s, r3, r3s, rn + mn + 1); + ASSERT_NOCARRY (add_signed_n (r3, r3, r3s, u1, u1s, rn + mn + 1)); + ASSERT (r3[rn + mn] < 2); + + if (t3s) + { + t3[mn] += mpn_add_n (t3, m2, t3, mn); + t3s = 0; + } + else if (t3[mn] > 0) + { + t3[mn] -= 
mpn_sub_n (t3, t3, m2, mn); + t3s = 1; + } + else + { + t3s = abs_sub_n (t3, m2, t3, mn); + } + MUL (r2, s2, rn, t3, mn + 1); /* 6 */ + + ASSERT_NOCARRY (add_signed_n (r2, r2, t3s, u0, u0s, rn + mn + 1)); + ASSERT (r2[rn + mn] < 2); +} + +void +mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, + mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, + mp_ptr tp) +{ + if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) + || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) + { + mp_ptr p0, p1; + unsigned i; + + /* Temporary storage: 3 rn + 2 mn */ + p0 = tp + rn; + p1 = p0 + rn + mn; + + for (i = 0; i < 2; i++) + { + MPN_COPY (tp, r0, rn); + + if (rn >= mn) + { + mpn_mul (p0, r0, rn, m0, mn); + mpn_mul (p1, r1, rn, m3, mn); + mpn_mul (r0, r1, rn, m2, mn); + mpn_mul (r1, tp, rn, m1, mn); + } + else + { + mpn_mul (p0, m0, mn, r0, rn); + mpn_mul (p1, m3, mn, r1, rn); + mpn_mul (r0, m2, mn, r1, rn); + mpn_mul (r1, m1, mn, tp, rn); + } + r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn); + r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn); + + r0 = r2; r1 = r3; + } + } + else + mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn, + m0, m1, m2, m3, mn, tp); +} diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h index 8dd018237..22a8cfff8 100644 --- a/mpn/ia64/gmp-mparam.h +++ b/mpn/ia64/gmp-mparam.h @@ -37,9 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 72 #define POWM_THRESHOLD 295 -#define HGCD_SCHOENHAGE_THRESHOLD 191 +#define HGCD_THRESHOLD 191 #define GCD_ACCEL_THRESHOLD 10 -#define GCD_SCHOENHAGE_THRESHOLD 336 +#define GCD_DC_THRESHOLD 336 #define GCDEXT_SCHOENHAGE_THRESHOLD 649 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/m68k/gmp-mparam.h b/mpn/m68k/gmp-mparam.h index c18bc5a63..c62304653 100644 --- a/mpn/m68k/gmp-mparam.h +++ b/mpn/m68k/gmp-mparam.h @@ -37,10 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 55 #define POWM_THRESHOLD 65 -#define HGCD_SCHOENHAGE_THRESHOLD 116 +#define HGCD_THRESHOLD 116 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 590 -#define GCDEXT_THRESHOLD 35 +#define GCD_DC_THRESHOLD 590 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ diff --git a/mpn/minithres/gmp-mparam.h b/mpn/minithres/gmp-mparam.h index 7586b7a0f..31b74337b 100644 --- a/mpn/minithres/gmp-mparam.h +++ b/mpn/minithres/gmp-mparam.h @@ -33,9 +33,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 6 #define POWM_THRESHOLD 4 -#define HGCD_SCHOENHAGE_THRESHOLD 10 +#define HGCD_THRESHOLD 10 #define GCD_ACCEL_THRESHOLD 2 -#define GCD_SCHOENHAGE_THRESHOLD 20 +#define GCD_DC_THRESHOLD 20 #define GCDEXT_SCHOENHAGE_THRESHOLD 20 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/mips32/gmp-mparam.h b/mpn/mips32/gmp-mparam.h index a5b736de3..d86fd3f01 100644 --- a/mpn/mips32/gmp-mparam.h +++ b/mpn/mips32/gmp-mparam.h @@ -37,7 +37,6 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 78 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 18 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/mips64/gmp-mparam.h b/mpn/mips64/gmp-mparam.h index 23b012149..d189e895c 100644 --- a/mpn/mips64/gmp-mparam.h +++ b/mpn/mips64/gmp-mparam.h @@ -36,10 +36,9 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 53 #define POWM_THRESHOLD 61 -#define HGCD_SCHOENHAGE_THRESHOLD 116 +#define HGCD_THRESHOLD 116 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 492 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 492 #define JACOBI_BASE_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/pa32/gmp-mparam.h b/mpn/pa32/gmp-mparam.h index 3c6d36c57..005539c0d 100644 --- a/mpn/pa32/gmp-mparam.h +++ b/mpn/pa32/gmp-mparam.h @@ -49,6 +49,5 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #ifndef GCD_ACCEL_THRESHOLD #define GCD_ACCEL_THRESHOLD 46 #endif -#ifndef GCDEXT_THRESHOLD #define GCDEXT_THRESHOLD 33 #endif diff --git a/mpn/pa32/hppa1_1/gmp-mparam.h b/mpn/pa32/hppa1_1/gmp-mparam.h index d3d6d4436..5ced74548 100644 --- a/mpn/pa32/hppa1_1/gmp-mparam.h +++ b/mpn/pa32/hppa1_1/gmp-mparam.h @@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 84 #define POWM_THRESHOLD 166 -#define HGCD_SCHOENHAGE_THRESHOLD 231 +#define HGCD_THRESHOLD 231 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 823 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 823 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 5 diff --git a/mpn/pa32/hppa2_0/gmp-mparam.h b/mpn/pa32/hppa2_0/gmp-mparam.h index 29ea97506..f5667840a 100644 --- a/mpn/pa32/hppa2_0/gmp-mparam.h +++ b/mpn/pa32/hppa2_0/gmp-mparam.h @@ -35,7 +35,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 166 #define GCD_ACCEL_THRESHOLD 4 -#define GCDEXT_THRESHOLD 0 #define DIVREM_1_NORM_THRESHOLD 4 #define DIVREM_1_UNNORM_THRESHOLD 6 diff --git a/mpn/pa64/gmp-mparam.h b/mpn/pa64/gmp-mparam.h index 537da5f71..e9d058f6b 100644 --- a/mpn/pa64/gmp-mparam.h +++ b/mpn/pa64/gmp-mparam.h @@ -39,10 +39,9 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 123 #define POWM_THRESHOLD 212 -#define HGCD_SCHOENHAGE_THRESHOLD 292 +#define HGCD_THRESHOLD 292 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 1498 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 1498 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/power/gmp-mparam.h b/mpn/power/gmp-mparam.h index 8cc6bf0c7..f9b10e6a4 100644 --- a/mpn/power/gmp-mparam.h +++ b/mpn/power/gmp-mparam.h @@ -30,10 +30,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 36 #define POWM_THRESHOLD 69 -#define HGCD_SCHOENHAGE_THRESHOLD 97 +#define HGCD_THRESHOLD 97 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 590 -#define GCDEXT_THRESHOLD 41 +#define GCD_DC_THRESHOLD 590 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 12 diff --git a/mpn/powerpc32/750/gmp-mparam.h b/mpn/powerpc32/750/gmp-mparam.h index f20fd665f..d604e6ed4 100644 --- a/mpn/powerpc32/750/gmp-mparam.h +++ b/mpn/powerpc32/750/gmp-mparam.h @@ -35,10 +35,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 35 #define POWM_THRESHOLD 48 -#define HGCD_SCHOENHAGE_THRESHOLD 93 +#define HGCD_THRESHOLD 93 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 676 -#define GCDEXT_THRESHOLD 31 +#define GCD_DC_THRESHOLD 676 #define JACOBI_BASE_METHOD 1 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/powerpc32/gmp-mparam.h b/mpn/powerpc32/gmp-mparam.h index 0387e2fb7..a77c98e8a 100644 --- a/mpn/powerpc32/gmp-mparam.h +++ b/mpn/powerpc32/gmp-mparam.h @@ -41,10 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 45 #define POWM_THRESHOLD 89 -#define HGCD_SCHOENHAGE_THRESHOLD 145 +#define HGCD_THRESHOLD 145 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 738 -#define GCDEXT_THRESHOLD 16 +#define GCD_DC_THRESHOLD 738 #define JACOBI_BASE_METHOD 1 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/powerpc64/gmp-mparam.h b/mpn/powerpc64/gmp-mparam.h index 6fe8a8d40..e0ab478e3 100644 --- a/mpn/powerpc64/gmp-mparam.h +++ b/mpn/powerpc64/gmp-mparam.h @@ -37,10 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 28 #define POWM_THRESHOLD 40 -#define HGCD_SCHOENHAGE_THRESHOLD 56 +#define HGCD_THRESHOLD 56 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 408 -#define GCDEXT_THRESHOLD 151 +#define GCD_DC_THRESHOLD 408 #define JACOBI_BASE_METHOD 1 #define MOD_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/s390/gmp-mparam.h b/mpn/s390/gmp-mparam.h index b09191456..d73884667 100644 --- a/mpn/s390/gmp-mparam.h +++ b/mpn/s390/gmp-mparam.h @@ -35,7 +35,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 63 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 28 #define DIVREM_1_NORM_THRESHOLD 0 #define DIVREM_1_UNNORM_THRESHOLD 5 diff --git a/mpn/sparc32/gmp-mparam.h b/mpn/sparc32/gmp-mparam.h index d275da51a..3bc6cd6db 100644 --- a/mpn/sparc32/gmp-mparam.h +++ b/mpn/sparc32/gmp-mparam.h @@ -32,7 +32,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 28 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 0 /* always */ #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 3 diff --git a/mpn/sparc32/v8/gmp-mparam.h b/mpn/sparc32/v8/gmp-mparam.h index fde006e08..f042c19e5 100644 --- a/mpn/sparc32/v8/gmp-mparam.h +++ b/mpn/sparc32/v8/gmp-mparam.h @@ -35,10 +35,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 24 #define POWM_THRESHOLD 38 -#define HGCD_SCHOENHAGE_THRESHOLD 69 +#define HGCD_THRESHOLD 69 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 498 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 498 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 6 diff --git a/mpn/sparc32/v8/supersparc/gmp-mparam.h b/mpn/sparc32/v8/supersparc/gmp-mparam.h index c6f2d83eb..feb90ef40 100644 --- a/mpn/sparc32/v8/supersparc/gmp-mparam.h +++ b/mpn/sparc32/v8/supersparc/gmp-mparam.h @@ -35,10 +35,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 26 #define POWM_THRESHOLD 79 -#define HGCD_SCHOENHAGE_THRESHOLD 97 +#define HGCD_THRESHOLD 97 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 470 -#define GCDEXT_THRESHOLD 14 +#define GCD_DC_THRESHOLD 470 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/sparc32/v9/gmp-mparam.h b/mpn/sparc32/v9/gmp-mparam.h index 2f11e400e..3d48d743b 100644 --- a/mpn/sparc32/v9/gmp-mparam.h +++ b/mpn/sparc32/v9/gmp-mparam.h @@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 125 #define POWM_THRESHOLD 150 -#define HGCD_SCHOENHAGE_THRESHOLD 210 +#define HGCD_THRESHOLD 210 #define GCD_ACCEL_THRESHOLD 4 -#define GCD_SCHOENHAGE_THRESHOLD 1291 -#define GCDEXT_THRESHOLD 9 +#define GCD_DC_THRESHOLD 1291 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ diff --git a/mpn/sparc64/gmp-mparam.h b/mpn/sparc64/gmp-mparam.h index 4bceda1db..9c59e698f 100644 --- a/mpn/sparc64/gmp-mparam.h +++ b/mpn/sparc64/gmp-mparam.h @@ -44,7 +44,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define POWM_THRESHOLD 85 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 20 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 3 diff --git a/mpn/vax/gmp-mparam.h b/mpn/vax/gmp-mparam.h index 4b7a2156d..ea262ddc4 100644 --- a/mpn/vax/gmp-mparam.h +++ b/mpn/vax/gmp-mparam.h @@ -32,7 +32,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ /* #define POWM_THRESHOLD */ /* #define GCD_ACCEL_THRESHOLD */ -#define GCDEXT_THRESHOLD 40 /* #define JACOBI_BASE_METHOD */ /* #define DIVREM_1_NORM_THRESHOLD */ diff --git a/mpn/x86/i486/gmp-mparam.h b/mpn/x86/i486/gmp-mparam.h index f064a3e69..aaddea9f1 100644 --- a/mpn/x86/i486/gmp-mparam.h +++ b/mpn/x86/i486/gmp-mparam.h @@ -37,7 +37,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 38 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 55 #define JACOBI_BASE_METHOD 2 #define USE_PREINV_DIVREM_1 0 diff --git a/mpn/x86/k6/gmp-mparam.h b/mpn/x86/k6/gmp-mparam.h index fc3303880..dbf8c59c8 100644 --- a/mpn/x86/k6/gmp-mparam.h +++ b/mpn/x86/k6/gmp-mparam.h @@ -37,10 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 76 #define POWM_THRESHOLD 97 -#define HGCD_SCHOENHAGE_THRESHOLD 242 +#define HGCD_THRESHOLD 242 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 1243 -#define GCDEXT_THRESHOLD 40 +#define GCD_DC_THRESHOLD 1243 #define JACOBI_BASE_METHOD 2 #define USE_PREINV_DIVREM_1 0 diff --git a/mpn/x86/k7/gmp-mparam.h b/mpn/x86/k7/gmp-mparam.h index a3927784d..5c5c1195e 100644 --- a/mpn/x86/k7/gmp-mparam.h +++ b/mpn/x86/k7/gmp-mparam.h @@ -41,9 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 84 #define POWM_THRESHOLD 134 -#define HGCD_SCHOENHAGE_THRESHOLD 220 +#define HGCD_THRESHOLD 220 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 908 +#define GCD_DC_THRESHOLD 908 #define GCDEXT_SCHOENHAGE_THRESHOLD 683 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/x86/p6/gmp-mparam.h b/mpn/x86/p6/gmp-mparam.h index 217facab4..a85c50027 100644 --- a/mpn/x86/p6/gmp-mparam.h +++ b/mpn/x86/p6/gmp-mparam.h @@ -45,7 +45,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 131 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 33 #define JACOBI_BASE_METHOD 1 #define USE_PREINV_DIVREM_1 0 diff --git a/mpn/x86/p6/mmx/gmp-mparam.h b/mpn/x86/p6/mmx/gmp-mparam.h index 1456b53a1..c1fa872f0 100644 --- a/mpn/x86/p6/mmx/gmp-mparam.h +++ b/mpn/x86/p6/mmx/gmp-mparam.h @@ -54,9 +54,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 48 -#define HGCD_SCHOENHAGE_THRESHOLD 145 +#define HGCD_THRESHOLD 145 #define GCD_ACCEL_THRESHOLD 5 -#define GCD_SCHOENHAGE_THRESHOLD 537 +#define GCD_DC_THRESHOLD 537 #define GCDEXT_SCHOENHAGE_THRESHOLD 948 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/x86/pentium/gmp-mparam.h b/mpn/x86/pentium/gmp-mparam.h index c7f398da8..5c49c4e3c 100644 --- a/mpn/x86/pentium/gmp-mparam.h +++ b/mpn/x86/pentium/gmp-mparam.h @@ -42,10 +42,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 52 #define POWM_THRESHOLD 77 -#define HGCD_SCHOENHAGE_THRESHOLD 121 +#define HGCD_THRESHOLD 121 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 615 -#define GCDEXT_THRESHOLD 13 +#define GCD_DC_THRESHOLD 615 #define JACOBI_BASE_METHOD 2 #define USE_PREINV_DIVREM_1 0 diff --git a/mpn/x86/pentium/mmx/gmp-mparam.h b/mpn/x86/pentium/mmx/gmp-mparam.h index 40eaecd6f..aae5fec48 100644 --- a/mpn/x86/pentium/mmx/gmp-mparam.h +++ b/mpn/x86/pentium/mmx/gmp-mparam.h @@ -42,9 +42,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 37 #define POWM_THRESHOLD 73 -#define HGCD_SCHOENHAGE_THRESHOLD 97 +#define HGCD_THRESHOLD 97 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 849 +#define GCD_DC_THRESHOLD 849 #define GCDEXT_THRESHOLD 14 #define JACOBI_BASE_METHOD 2 diff --git a/mpn/x86/pentium4/sse2/gmp-mparam.h b/mpn/x86/pentium4/sse2/gmp-mparam.h index 113356dcc..3ad7a93a1 100644 --- a/mpn/x86/pentium4/sse2/gmp-mparam.h +++ b/mpn/x86/pentium4/sse2/gmp-mparam.h @@ -48,9 +48,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 80 -#define HGCD_SCHOENHAGE_THRESHOLD 101 +#define HGCD_THRESHOLD 101 #define GCD_ACCEL_THRESHOLD 6 -#define GCD_SCHOENHAGE_THRESHOLD 341 +#define GCD_DC_THRESHOLD 341 #define GCDEXT_SCHOENHAGE_THRESHOLD 375 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/x86_64/core2/gmp-mparam.h b/mpn/x86_64/core2/gmp-mparam.h index e4a4ea2e8..44e3af47d 100644 --- a/mpn/x86_64/core2/gmp-mparam.h +++ b/mpn/x86_64/core2/gmp-mparam.h @@ -49,10 +49,10 @@ MA 02110-1301, USA. 
*/ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 40 -#define HGCD_SCHOENHAGE_THRESHOLD 191 -#define GCD_ACCEL_THRESHOLD 5 -#define GCD_SCHOENHAGE_THRESHOLD 948 -#define GCDEXT_SCHOENHAGE_THRESHOLD 254 +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD_THRESHOLD 191 +#define GCD_DC_THRESHOLD 948 +#define GCDEXT_DC_THRESHOLD 254 #define JACOBI_BASE_METHOD 1 #define MOD_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/x86_64/gmp-mparam.h b/mpn/x86_64/gmp-mparam.h index 3c3d94315..fc2cd275c 100644 --- a/mpn/x86_64/gmp-mparam.h +++ b/mpn/x86_64/gmp-mparam.h @@ -47,10 +47,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 50 -#define HGCD_SCHOENHAGE_THRESHOLD 145 +#define MATRIX22_STRASSEN_THRESHOLD 22 +#define HGCD_THRESHOLD 111 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 445 -#define GCDEXT_SCHOENHAGE_THRESHOLD 713 +#define GCD_DC_THRESHOLD 412 +#define GCDEXT_DC_THRESHOLD 390 #define JACOBI_BASE_METHOD 1 #define MOD_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/x86_64/pentium4/gmp-mparam.h b/mpn/x86_64/pentium4/gmp-mparam.h index e1c56bcac..afb106f59 100644 --- a/mpn/x86_64/pentium4/gmp-mparam.h +++ b/mpn/x86_64/pentium4/gmp-mparam.h @@ -54,9 +54,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 27 -#define HGCD_SCHOENHAGE_THRESHOLD 133 +#define HGCD_THRESHOLD 133 #define GCD_ACCEL_THRESHOLD 10 -#define GCD_SCHOENHAGE_THRESHOLD 792 +#define GCD_DC_THRESHOLD 792 #define GCDEXT_SCHOENHAGE_THRESHOLD 339 #define JACOBI_BASE_METHOD 1 diff --git a/tests/mpn/Makefile.am b/tests/mpn/Makefile.am index decce7182..f67138a6c 100644 --- a/tests/mpn/Makefile.am +++ b/tests/mpn/Makefile.am @@ -22,7 +22,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/tests LDADD = $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la check_PROGRAMS = t-asmtype t-aors_1 t-divrem_1 t-fat t-get_d \ - t-instrument t-iord_u t-mp_bases t-perfsqr t-scan t-hgcd + t-instrument t-iord_u t-mp_bases t-perfsqr t-scan t-hgcd t-matrix22 TESTS = $(check_PROGRAMS) diff --git a/tests/mpn/t-hgcd.c b/tests/mpn/t-hgcd.c index 94d4ca95a..2615fd679 100644 --- a/tests/mpn/t-hgcd.c +++ b/tests/mpn/t-hgcd.c @@ -25,7 +25,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #include "gmp-impl.h" #include "tests.h" -static int one_test __GMP_PROTO ((mpz_t, mpz_t, int)); +static mp_size_t one_test __GMP_PROTO ((mpz_t, mpz_t, int)); static void debug_mp __GMP_PROTO ((mpz_t, int)); #define MIN_OPERAND_SIZE 2 @@ -34,31 +34,26 @@ static void debug_mp __GMP_PROTO ((mpz_t, int)); struct value { int res; const char *a; const char *b; }; static const struct value hgcd_values[] = { #if GMP_NUMB_BITS == 32 - { 4, + { 5, "0x1bddff867272a9296ac493c251d7f46f09a5591fe", "0xb55930a2a68a916450a7de006031068c5ddb0e5c" }, { 4, "0x2f0ece5b1ee9c15e132a01d55768dc13", "0x1c6f4fd9873cdb24466e6d03e1cc66e7" }, - { 4, "0x7FFFFC003FFFFFFFFFC5", "0x3FFFFE001FFFFFFFFFE3"}, + { 3, "0x7FFFFC003FFFFFFFFFC5", "0x3FFFFE001FFFFFFFFFE3"}, #endif { -1, NULL, NULL } }; struct hgcd_ref { - /* Sign here, u and v are stored as absolute values */ - int sign; - - mpz_t r[4]; - mpz_t u[4]; - mpz_t v[4]; + mpz_t m[2][2]; }; static void hgcd_ref_init __GMP_PROTO ((struct hgcd_ref *hgcd)); static void hgcd_ref_clear __GMP_PROTO ((struct hgcd_ref *hgcd)); -static int hgcd_ref __GMP_PROTO ((struct hgcd_ref *hgcd, const mpz_t a, const mpz_t b)); -static int hgcd_ref_equal __GMP_PROTO ((const struct hgcd *hgcd, const struct hgcd_ref *ref)); +static int hgcd_ref __GMP_PROTO ((struct hgcd_ref *hgcd, mpz_t a, mpz_t b)); +static int hgcd_ref_equal __GMP_PROTO ((const struct hgcd_matrix *hgcd, const struct hgcd_ref *ref)); int main (int argc, char **argv) @@ -80,7 +75,7 @@ main (int argc, char **argv) for (i = 0; hgcd_values[i].res >= 0; i++) { - int res; + mp_size_t res; mpz_set_str (op1, hgcd_values[i].a, 0); mpz_set_str (op2, hgcd_values[i].b, 0); @@ -117,7 +112,7 @@ main (int argc, char **argv) if (mpz_cmp (op1, op2) < 0) mpz_swap (op1, op2); - if (mpz_size(op1) > 0) + if (mpz_size (op1) > 0) one_test (op1, op2, i); /* Generate a division chain backwards, allowing otherwise @@ -133,7 +128,7 @@ main (int argc, char **argv) chain_len = 1000000; #else mpz_urandomb (bs, rands, 32); - 
chain_len = mpz_get_ui (bs) % (GMP_NUMB_BITS * GCD_SCHOENHAGE_THRESHOLD / 256); + chain_len = mpz_get_ui (bs) % (GMP_NUMB_BITS * GCD_DC_THRESHOLD / 256); #endif for (j = 0; j < chain_len; j++) @@ -146,7 +141,7 @@ main (int argc, char **argv) mpz_add (op1, op1, temp1); /* Don't generate overly huge operands. */ - if (SIZ (op1) > 3 * GCD_SCHOENHAGE_THRESHOLD) + if (SIZ (op1) > 3 * GCD_DC_THRESHOLD) break; mpz_urandomb (bs, rands, 32); @@ -157,13 +152,13 @@ main (int argc, char **argv) mpz_add (op2, op2, temp1); /* Don't generate overly huge operands. */ - if (SIZ (op2) > 3 * GCD_SCHOENHAGE_THRESHOLD) + if (SIZ (op2) > 3 * GCD_DC_THRESHOLD) break; } if (mpz_cmp (op1, op2) < 0) mpz_swap (op1, op2); - if (mpz_size(op1) > 0) + if (mpz_size (op1) > 0) one_test (op1, op2, i); } @@ -177,33 +172,37 @@ debug_mp (mpz_t x, int base) } static int +mpz_mpn_equal (const mpz_t a, mp_srcptr bp, mp_size_t bsize); + +static mp_size_t one_test (mpz_t a, mpz_t b, int i) { - struct hgcd hgcd; + struct hgcd_matrix hgcd; struct hgcd_ref ref; - struct qstack quotients; - int res[2]; + + mpz_t ref_r0; + mpz_t ref_r1; + mpz_t hgcd_r0; + mpz_t hgcd_r1; + + mp_size_t res[2]; mp_size_t asize; mp_size_t bsize; mp_size_t hgcd_init_scratch; - mp_size_t qstack_scratch; mp_size_t hgcd_scratch; mp_ptr hgcd_init_tp; - mp_ptr qstack_tp; mp_ptr hgcd_tp; asize = a->_mp_size; bsize = b->_mp_size; - hgcd_init_scratch = mpn_hgcd_init_itch (asize); - hgcd_init_tp = refmpn_malloc_limbs (hgcd_init_scratch); - mpn_hgcd_init (&hgcd, asize, hgcd_init_tp); + ASSERT (asize >= bsize); - qstack_scratch = qstack_itch (asize); - qstack_tp = refmpn_malloc_limbs (qstack_scratch); - qstack_init ("ients, asize, qstack_tp, qstack_scratch); + hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (asize); + hgcd_init_tp = refmpn_malloc_limbs (hgcd_init_scratch); + mpn_hgcd_matrix_init (&hgcd, asize, hgcd_init_tp); hgcd_scratch = mpn_hgcd_itch (asize); hgcd_tp = refmpn_malloc_limbs (hgcd_scratch); @@ -221,28 +220,37 @@ one_test (mpz_t 
a, mpz_t b, int i) #endif hgcd_ref_init (&ref); - res[0] = hgcd_ref (&ref, a, b); - res[1] = mpn_hgcd (&hgcd, - a->_mp_d, asize, - b->_mp_d, bsize, - "ients, - hgcd_tp, hgcd_scratch); + mpz_init_set (ref_r0, a); + mpz_init_set (ref_r1, b); + res[0] = hgcd_ref (&ref, ref_r0, ref_r1); + + mpz_init_set (hgcd_r0, a); + mpz_init_set (hgcd_r1, b); + if (bsize < asize) + { + _mpz_realloc (hgcd_r1, asize); + MPN_ZERO (hgcd_r1->_mp_d + bsize, asize - bsize); + } + res[1] = mpn_hgcd (hgcd_r0->_mp_d, + hgcd_r1->_mp_d, + asize, + &hgcd, hgcd_tp); if (res[0] != res[1]) { fprintf (stderr, "ERROR in test %d\n", i); - fprintf (stderr, "Different return code from hgcd and hgcd_ref\n"); + fprintf (stderr, "Different return value from hgcd and hgcd_ref\n"); fprintf (stderr, "op1="); debug_mp (a, -16); fprintf (stderr, "op2="); debug_mp (b, -16); - fprintf (stderr, "hgcd_ref: %d\n", res[0]); - fprintf (stderr, "mpn_hgcd: %d\n", res[1]); + fprintf (stderr, "hgcd_ref: %ld\n", (long) res[0]); + fprintf (stderr, "mpn_hgcd: %ld\n", (long) res[1]); abort (); } if (res[0] > 0) { - ASSERT_HGCD (&hgcd, a->_mp_d, asize, b->_mp_d, bsize, 0, 4); - - if (!hgcd_ref_equal (&hgcd, &ref)) + if (!hgcd_ref_equal (&hgcd, &ref) + || !mpz_mpn_equal (ref_r0, hgcd_r0->_mp_d, res[1]) + || !mpz_mpn_equal (ref_r1, hgcd_r1->_mp_d, res[1])) { fprintf (stderr, "ERROR in test %d\n", i); fprintf (stderr, "mpn_hgcd and hgcd_ref returned different values\n"); @@ -253,9 +261,12 @@ one_test (mpz_t a, mpz_t b, int i) } refmpn_free_limbs (hgcd_init_tp); - refmpn_free_limbs (qstack_tp); refmpn_free_limbs (hgcd_tp); hgcd_ref_clear (&ref); + mpz_clear (ref_r0); + mpz_clear (ref_r1); + mpz_clear (hgcd_r0); + mpz_clear (hgcd_r1); return res[0]; } @@ -264,11 +275,11 @@ static void hgcd_ref_init (struct hgcd_ref *hgcd) { unsigned i; - for (i = 0; i<4; i++) + for (i = 0; i<2; i++) { - mpz_init (hgcd->r[i]); - mpz_init (hgcd->u[i]); - mpz_init (hgcd->v[i]); + unsigned j; + for (j = 0; j<2; j++) + mpz_init (hgcd->m[i][j]); } } @@ 
-276,137 +287,91 @@ static void hgcd_ref_clear (struct hgcd_ref *hgcd) { unsigned i; - for (i = 0; i<4; i++) + for (i = 0; i<2; i++) { - mpz_clear (hgcd->r[i]); - mpz_clear (hgcd->u[i]); - mpz_clear (hgcd->v[i]); + unsigned j; + for (j = 0; j<2; j++) + mpz_clear (hgcd->m[i][j]); } } + static int -hgcd_ref (struct hgcd_ref *hgcd, const mpz_t a, const mpz_t b) +sdiv_qr (mpz_t q, mpz_t r, mp_size_t s, const mpz_t a, const mpz_t b) { - mp_size_t M = (a->_mp_size + 1) / 2; - mpz_t t; + mpz_fdiv_qr (q, r, a, b); + if (mpz_size (r) <= s) + { + mpz_add (r, r, b); + mpz_sub_ui (q, q, 1); + } + + return (mpz_sgn (q) > 0); +} + +static int +hgcd_ref (struct hgcd_ref *hgcd, mpz_t a, mpz_t b) +{ + mp_size_t n = MAX (mpz_size (a), mpz_size (b)); + mp_size_t s = n/2 + 1; + mp_size_t asize; + mp_size_t bsize; mpz_t q; int res; - if (mpz_size(b) <= M) + if (mpz_size (a) <= s || mpz_size (b) <= s) return 0; - mpz_init (q); - mpz_fdiv_qr(q, hgcd->r[2], a, b); - - if (mpz_size (hgcd->r[2]) <= M) + res = mpz_cmp (a, b); + if (res < 0) { - mpz_clear (q); - return 0; - } - - mpz_set (hgcd->r[0], a); mpz_set (hgcd->r[1], b); + mpz_sub (b, b, a); + if (mpz_size (b) <= s) + return 0; - mpz_set_ui (hgcd->u[0], 1); mpz_set_ui (hgcd->v[0], 0); - mpz_set_ui (hgcd->u[1], 0); mpz_set_ui (hgcd->v[1], 1); - mpz_set_ui (hgcd->u[2], 1); mpz_set (hgcd->v[2], q); + mpz_set_ui (hgcd->m[0][0], 1); mpz_set_ui (hgcd->m[0][1], 0); + mpz_set_ui (hgcd->m[1][0], 1); mpz_set_ui (hgcd->m[1][1], 1); + } + else if (res > 0) + { + mpz_sub (a, a, b); + if (mpz_size (a) <= s) + return 0; - hgcd->sign = 0; + mpz_set_ui (hgcd->m[0][0], 1); mpz_set_ui (hgcd->m[0][1], 1); + mpz_set_ui (hgcd->m[1][0], 0); mpz_set_ui (hgcd->m[1][1], 1); + } + else + return 0; - mpz_init (t); + mpz_init (q); for (;;) { - mpz_fdiv_qr(q, hgcd->r[3], hgcd->r[1], hgcd->r[2]); + ASSERT (mpz_size (a) > s); + ASSERT (mpz_size (b) > s); - mpz_mul (hgcd->u[3], q, hgcd->u[2]); - mpz_add (hgcd->u[3], hgcd->u[3], hgcd->u[1]); - - mpz_mul (hgcd->v[3], 
q, hgcd->v[2]); - mpz_add (hgcd->v[3], hgcd->v[3], hgcd->v[1]); - - if (mpz_size (hgcd->r[3]) <= M) + if (mpz_cmp (a, b) > 0) { -#if 0 - unsigned i; - printf("hgcd_ref: sign = %d\n", hgcd->sign); - for (i = 0; i < 4; i++) - gmp_printf("r = %Zd, u = %Zd, v = %Zd\n", - hgcd->r[i], hgcd->u[i], hgcd->v[i]); -#endif - /* Check Jebelean's criterion */ - - if (hgcd->sign >= 0) - { - /* Check if r1 - r2 >= u2 - u1 */ - mpz_add (t, hgcd->u[2], hgcd->u[1]); - } - else - { - /* Check if r1 - r2 >= v2 - v1 */ - mpz_add (t, hgcd->v[2], hgcd->v[1]); - } - - /* Check r1 >= t + r2 */ - mpz_add (t, t, hgcd->r[2]); - if (mpz_cmp (hgcd->r[1], t) < 0) - { - res = 2; break; - } - - /* Now r2 is correct */ - if (hgcd->sign >= 0) - { - /* Check r3 >= max (-u3, -v3) = u3 */ - if (mpz_cmp (hgcd->r[3], hgcd->u[3]) < 0) - { - res = 3; break; - } - - /* Check r3 - r2 >= v3 - v2 */ - mpz_add (t, hgcd->v[3], hgcd->v[2]); - } - else - { - /* Check r3 >= max (-u3, -v3) = v3 */ - if (mpz_cmp (hgcd->r[3], hgcd->v[3]) < 0) - { - res = 3; break; - } - - /* Check r3 - r2 >= u3 - u2 */ - mpz_add (t, hgcd->u[3], hgcd->u[2]); - } - - /* Check r2 >= t + r3 */ - mpz_add (t, t, hgcd->r[3]); - if (mpz_cmp (hgcd->r[2], t) < 0) - { - res = 3; break; - } - - /* Now r3 is correct */ - res = 4; break; + if (!sdiv_qr (q, a, s, a, b)) + break; + mpz_addmul (hgcd->m[0][1], q, hgcd->m[0][0]); + mpz_addmul (hgcd->m[1][1], q, hgcd->m[1][0]); + } + else + { + if (!sdiv_qr (q, b, s, b, a)) + break; + mpz_addmul (hgcd->m[0][0], q, hgcd->m[0][1]); + mpz_addmul (hgcd->m[1][0], q, hgcd->m[1][1]); } - - /* Shift rows */ - hgcd->sign = ~hgcd->sign; - mpz_swap (hgcd->r[0], hgcd->r[1]); - mpz_swap (hgcd->r[1], hgcd->r[2]); - mpz_swap (hgcd->r[2], hgcd->r[3]); - - mpz_swap (hgcd->u[0], hgcd->u[1]); - mpz_swap (hgcd->u[1], hgcd->u[2]); - mpz_swap (hgcd->u[2], hgcd->u[3]); - - mpz_swap (hgcd->v[0], hgcd->v[1]); - mpz_swap (hgcd->v[1], hgcd->v[2]); - mpz_swap (hgcd->v[2], hgcd->v[3]); } - mpz_clear (t); mpz_clear (q); - return res; 
+ asize = mpz_size (a); + bsize = mpz_size (b); + return MAX (asize, bsize); } static int @@ -416,25 +381,22 @@ mpz_mpn_equal (const mpz_t a, mp_srcptr bp, mp_size_t bsize) mp_size_t asize = a->_mp_size; MPN_NORMALIZE (bp, bsize); - return asize == bsize && mpn_cmp(ap, bp, asize) == 0; + return asize == bsize && mpn_cmp (ap, bp, asize) == 0; } static int -hgcd_ref_equal (const struct hgcd *hgcd, const struct hgcd_ref *ref) +hgcd_ref_equal (const struct hgcd_matrix *hgcd, const struct hgcd_ref *ref) { unsigned i; - if (ref->sign != hgcd->sign) - return 0; - - for (i = 0; i<4; i++) + for (i = 0; i<2; i++) { - if (!mpz_mpn_equal (ref->r[i], hgcd->row[i].rp, hgcd->row[i].rsize)) - return 0; - if (!mpz_mpn_equal (ref->u[i], hgcd->row[i].uvp[0], hgcd->size)) - return 0; - if (!mpz_mpn_equal (ref->v[i], hgcd->row[i].uvp[1], hgcd->size)) - return 0; + unsigned j; + + for (j = 0; j<2; j++) + if (!mpz_mpn_equal (ref->m[i][j], hgcd->p[i][j], hgcd->n)) + return 0; } + return 1; } diff --git a/tests/mpn/t-matrix22.c b/tests/mpn/t-matrix22.c new file mode 100644 index 000000000..17d1dc614 --- /dev/null +++ b/tests/mpn/t-matrix22.c @@ -0,0 +1,207 @@ +/* Tests matrix22_mul. + +Copyright 2008 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#include <stdio.h> +#include <stdlib.h> + +#include "gmp.h" +#include "gmp-impl.h" +#include "tests.h" + +struct matrix { + mp_size_t alloc; + mp_size_t n; + mp_ptr e00, e01, e10, e11; +}; + +static void +matrix_init (struct matrix *M, mp_size_t n) +{ + mp_ptr p = refmpn_malloc_limbs (4*(n+1)); + M->e00 = p; p += n+1; + M->e01 = p; p += n+1; + M->e10 = p; p += n+1; + M->e11 = p; + M->alloc = n + 1; + M->n = 0; +} + +static void +matrix_clear (struct matrix *M) +{ + refmpn_free_limbs (M->e00); +} + +static void +matrix_copy (struct matrix *R, const struct matrix *M) +{ + R->n = M->n; + MPN_COPY (R->e00, M->e00, M->n); + MPN_COPY (R->e01, M->e01, M->n); + MPN_COPY (R->e10, M->e10, M->n); + MPN_COPY (R->e11, M->e11, M->n); +} + +/* Used with same size, so no need for normalization. */ +static int +matrix_equal_p (const struct matrix *A, const struct matrix *B) +{ + return (A->n == B->n + && mpn_cmp (A->e00, B->e00, A->n) == 0 + && mpn_cmp (A->e01, B->e01, A->n) == 0 + && mpn_cmp (A->e10, B->e10, A->n) == 0 + && mpn_cmp (A->e11, B->e11, A->n) == 0); +} + +static void +matrix_random(struct matrix *M, mp_size_t n, gmp_randstate_ptr rands) +{ + M->n = n; + mpn_random (M->e00, n); + mpn_random (M->e01, n); + mpn_random (M->e10, n); + mpn_random (M->e11, n); +} + +#define MUL(rp, ap, an, bp, bn) do { \ + if (an > bn) \ + mpn_mul (rp, ap, an, bp, bn); \ + else \ + mpn_mul (rp, bp, bn, ap, an); \ + } while(0) + +static void +ref_matrix22_mul (struct matrix *R, + const struct matrix *A, + const struct matrix *B, mp_ptr tp) +{ + mp_size_t an, bn, n; + mp_ptr r00, r01, r10, r11, a00, a01, a10, a11, b00, b01, b10, b11; + + if (A->n >= B->n) + { + r00 = R->e00; a00 = A->e00; b00 = B->e00; + r01 = R->e01; a01 = A->e01; b01 = B->e01; + r10 = R->e10; a10 = A->e10; b10 = B->e10; + r11 = R->e11; a11 = A->e11; b11 = B->e11; + an = A->n, bn = B->n; + } + else + { + /* Transpose */ + r00 = R->e00; a00 = B->e00; b00 = A->e00; + r01 = R->e10; a01 = B->e10; b01 = A->e10; + r10 = 
R->e01; a10 = B->e01; b10 = A->e01; + r11 = R->e11; a11 = B->e11; b11 = A->e11; + an = B->n, bn = A->n; + } + n = an + bn; + R->n = n + 1; + + mpn_mul (r00, a00, an, b00, bn); + mpn_mul (tp, a01, an, b10, bn); + r00[n] = mpn_add_n (r00, r00, tp, n); + + mpn_mul (r01, a00, an, b01, bn); + mpn_mul (tp, a01, an, b11, bn); + r01[n] = mpn_add_n (r01, r01, tp, n); + + mpn_mul (r10, a10, an, b00, bn); + mpn_mul (tp, a11, an, b10, bn); + r10[n] = mpn_add_n (r10, r10, tp, n); + + mpn_mul (r11, a10, an, b01, bn); + mpn_mul (tp, a11, an, b11, bn); + r11[n] = mpn_add_n (r11, r11, tp, n); +} + +static void +one_test (const struct matrix *A, const struct matrix *B, int i) +{ + struct matrix R; + struct matrix P; + mp_ptr tp; + + matrix_init (&R, A->n + B->n + 1); + matrix_init (&P, A->n + B->n + 1); + + tp = refmpn_malloc_limbs (mpn_matrix22_mul_itch (A->n, B->n)); + + ref_matrix22_mul (&R, A, B, tp); + matrix_copy (&P, A); + mpn_matrix22_mul (P.e00, P.e01, P.e10, P.e11, A->n, + B->e00, B->e01, B->e10, B->e11, B->n, tp); + P.n = A->n + B->n + 1; + if (!matrix_equal_p (&R, &P)) + { + fprintf (stderr, "ERROR in test %d\n", i); + gmp_fprintf (stderr, "A = (%Nx, %Nx\n %Nx, %Nx)\n" + "B = (%Nx, %Nx\n %Nx, %Nx)\n" + "R = (%Nx, %Nx (expected)\n %Nx, %Nx)\n" + "P = (%Nx, %Nx (incorrect)\n %Nx, %Nx)\n", + A->e00, A->n, A->e01, A->n, A->e10, A->n, A->e11, A->n, + B->e00, B->n, B->e01, B->n, B->e10, B->n, B->e11, B->n, + R.e00, R.n, R.e01, R.n, R.e10, R.n, R.e11, R.n, + P.e00, P.n, P.e01, P.n, P.e10, P.n, P.e11, P.n); + abort(); + } + refmpn_free_limbs (tp); + matrix_clear (&R); + matrix_clear (&P); +} + +#define MAX_SIZE (2+2*MATRIX22_STRASSEN_THRESHOLD) + +int +main (int argc, char **argv) +{ + struct matrix A; + struct matrix B; + + gmp_randstate_ptr rands; + mpz_t bs; + int i; + + tests_start (); + rands = RANDS; + + matrix_init (&A, MAX_SIZE); + matrix_init (&B, MAX_SIZE); + mpz_init (bs); + + for (i = 0; i < 17; i++) + { + mp_size_t an, bn; + mpz_urandomb (bs, rands, 32); + an = 1 + 
mpz_get_ui (bs) % MAX_SIZE; + mpz_urandomb (bs, rands, 32); + bn = 1 + mpz_get_ui (bs) % MAX_SIZE; + + matrix_random (&A, an, rands); + matrix_random (&B, bn, rands); + + one_test (&A, &B, i); + } + mpz_clear (bs); + matrix_clear (&A); + matrix_clear (&B); + + return 0; +} diff --git a/tests/mpz/t-gcd.c b/tests/mpz/t-gcd.c index 13065bdab..a58832861 100644 --- a/tests/mpz/t-gcd.c +++ b/tests/mpz/t-gcd.c @@ -82,10 +82,10 @@ check_data (void) to reinitialize them for each test. */ mpz_t gcd1, gcd2, s, t, temp1, temp2; -#if GCD_SCHOENHAGE_THRESHOLD > GCDEXT_SCHOENHAGE_THRESHOLD -#define MAX_SCHOENHAGE_THRESHOLD GCD_SCHOENHAGE_THRESHOLD +#if GCD_DC_THRESHOLD > GCDEXT_DC_THRESHOLD +#define MAX_SCHOENHAGE_THRESHOLD GCD_DC_THRESHOLD #else -#define MAX_SCHOENHAGE_THRESHOLD GCDEXT_SCHOENHAGE_THRESHOLD +#define MAX_SCHOENHAGE_THRESHOLD GCDEXT_DC_THRESHOLD #endif /* Define this to make all operands be large enough for Schoenhage gcd @@ -252,6 +252,7 @@ one_test (mpz_t op1, mpz_t op2, mpz_t ref, int i) fprintf (stderr, "op1="); debug_mp (op1, -16); fprintf (stderr, "op2="); debug_mp (op2, -16); fprintf (stderr, "mpz_gcdext returns:\n");debug_mp (gcd1, -16); + fprintf (stderr, "s="); debug_mp (s, -16); abort (); } diff --git a/tune/Makefile.am b/tune/Makefile.am index 8748cbc4d..96d90ae77 100644 --- a/tune/Makefile.am +++ b/tune/Makefile.am @@ -41,7 +41,7 @@ EXTRA_LTLIBRARIES = libspeed.la libspeed_la_SOURCES = \ common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \ - freq.c gcd_bin.c gcd_accel.c gcd_finda_gen.c \ + freq.c \ gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \ jacbase1.c jacbase2.c jacbase3.c \ mod_1_div.c mod_1_inv.c modlinv.c \ @@ -124,7 +124,7 @@ DISTCLEANFILES = sqr_basecase.c $(MANY_DISTCLEAN) TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c TUNE_MPN_SRCS_BASIC = dc_divrem_n.c divrem_2.c gcd.c gcdext.c get_str.c \ - set_str.c hgcd.c mul_n.c mullow_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c + set_str.c matrix22_mul.c hgcd.c mul_n.c 
mullow_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c $(TUNE_MPN_SRCS_BASIC): for i in $(TUNE_MPN_SRCS_BASIC); do \ diff --git a/tune/common.c b/tune/common.c index c7b9b4e61..9efd4f85a 100644 --- a/tune/common.c +++ b/tune/common.c @@ -999,18 +999,71 @@ speed_mpn_mullow_basecase (struct speed_params *s) } double +speed_mpn_matrix22_mul (struct speed_params *s) +{ + /* Speed params only includes 2 inputs, so we have to invent the + other 6. */ + + mp_ptr a1, a2, a3; + mp_ptr r0, r1, r2, r3; + mp_ptr b1, b2, b3; + mp_ptr tp; + mp_size_t scratch; + unsigned i; + double t; + TMP_DECL; + + TMP_MARK; + SPEED_TMP_ALLOC_LIMBS (a1, s->size, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (a2, s->size, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (a3, s->size, s->align_xp); + + SPEED_TMP_ALLOC_LIMBS (b1, s->size, s->align_yp); + SPEED_TMP_ALLOC_LIMBS (b2, s->size, s->align_yp); + SPEED_TMP_ALLOC_LIMBS (b3, s->size, s->align_yp); + + SPEED_TMP_ALLOC_LIMBS (r0, 2 * s->size +1, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (r1, 2 * s->size +1, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (r2, 2 * s->size +1, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (r3, 2 * s->size +1, s->align_xp); + + mpn_random (a1, s->size); + mpn_random (a2, s->size); + mpn_random (a3, s->size); + mpn_random (b1, s->size); + mpn_random (b2, s->size); + mpn_random (b3, s->size); + + scratch = mpn_matrix22_mul_itch (s->size, s->size); + SPEED_TMP_ALLOC_LIMBS (tp, scratch, s->align_wp); + + speed_starttime (); + i = s->reps; + do + { + MPN_COPY (r0, s->xp, s->size); + MPN_COPY (r1, a1, s->size); + MPN_COPY (r2, a2, s->size); + MPN_COPY (r3, a3, s->size); + mpn_matrix22_mul (r0, r1, r2, r3, s->size, s->yp, b1, b2, b3, s->size, tp); + } + while (--i != 0); + t = speed_endtime(); + TMP_FREE; + return t; +} + +double speed_mpn_hgcd (struct speed_params *s) { mp_ptr wp; - mp_size_t hgcd_init_scratch = mpn_hgcd_init_itch (s->size); - mp_size_t qstack_scratch = qstack_itch (s->size); + mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); 
mp_size_t hgcd_scratch = mpn_hgcd_itch (s->size); mp_ptr ap; mp_ptr bp; mp_ptr tmp1, tmp2; - struct hgcd hgcd; - struct qstack quotients; + struct hgcd_matrix hgcd; int res; unsigned i; double t; @@ -1024,53 +1077,38 @@ speed_mpn_hgcd (struct speed_params *s) SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); - MPN_COPY (ap, s->xp, s->size); - MPN_COPY (bp, s->yp, s->size); - ap[s->size - 1] |= 1; - bp[s->size - 1] |= 1; - - /* We must have a >= b */ - if (mpn_cmp (ap, bp, s->size) < 0) - MP_PTR_SWAP (ap, bp); + s->xp[s->size - 1] |= 1; + s->yp[s->size - 1] |= 1; SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp); - mpn_hgcd_init (&hgcd, s->size, tmp1); - SPEED_TMP_ALLOC_LIMBS (tmp2, qstack_scratch, s->align_wp); - qstack_init ("ients, s->size, tmp2, qstack_scratch); + mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp); speed_starttime (); i = s->reps; do { - qstack_reset ("ients, s->size); - res = mpn_hgcd (&hgcd, ap, s->size, bp, s->size, - "ients, - wp, hgcd_scratch); + MPN_COPY (ap, s->xp, s->size); + MPN_COPY (bp, s->yp, s->size); + res = mpn_hgcd (ap, bp, s->size, &hgcd, wp); } while (--i != 0); t = speed_endtime (); -#if WANT_ASSERT - if (res) - ASSERT_HGCD (&hgcd, ap, s->size, bp, s->size, 0, 4); -#endif TMP_FREE; return t; } -#if 0 + double speed_mpn_hgcd_lehmer (struct speed_params *s) { mp_ptr wp; - mp_size_t hgcd_init_scratch = mpn_hgcd_init_itch (s->size); - mp_size_t qstack_scratch = qstack_itch (s->size); - mp_size_t hgcd_scratch = mpn_hgcd_itch (s->size); + mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); + mp_size_t hgcd_scratch = MPN_HGCD_LEHMER_ITCH (s->size); mp_ptr ap; mp_ptr bp; mp_ptr tmp1, tmp2; - struct hgcd hgcd; - struct qstack quotients; + struct hgcd_matrix hgcd; int res; unsigned i; double t; @@ -1084,45 +1122,33 @@ speed_mpn_hgcd_lehmer (struct speed_params *s) SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, 
s->align_xp); SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); - MPN_COPY (ap, s->xp, s->size); - MPN_COPY (bp, s->yp, s->size); - ap[s->size - 1] |= 1; - bp[s->size - 1] |= 1; - - /* We must have a >= b */ - if (mpn_cmp (ap, bp, s->size) < 0) - MP_PTR_SWAP (ap, bp); + s->xp[s->size - 1] |= 1; + s->yp[s->size - 1] |= 1; SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp); - mpn_hgcd_init (&hgcd, s->size, tmp1); - SPEED_TMP_ALLOC_LIMBS (tmp2, qstack_scratch, s->align_wp); - qstack_init ("ients, s->size, tmp2, qstack_scratch); + mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp); speed_starttime (); i = s->reps; do { - qstack_reset ("ients, s->size); - res = mpn_hgcd_lehmer (&hgcd, ap, s->size, bp, s->size, - "ients, - wp, hgcd_scratch); + MPN_COPY (ap, s->xp, s->size); + MPN_COPY (bp, s->yp, s->size); + res = mpn_hgcd_lehmer (ap, bp, s->size, &hgcd, wp); } while (--i != 0); t = speed_endtime (); -#if WANT_ASSERT - if (res) - ASSERT_HGCD (&hgcd, ap, s->size, bp, s->size, 0, 4); -#endif TMP_FREE; return t; } -#endif + double speed_mpn_gcd (struct speed_params *s) { SPEED_ROUTINE_MPN_GCD (mpn_gcd); } +#if 0 double speed_mpn_gcd_binary (struct speed_params *s) { @@ -1133,7 +1159,7 @@ speed_mpn_gcd_accel (struct speed_params *s) { SPEED_ROUTINE_MPN_GCD (mpn_gcd_accel); } - +#endif #if HAVE_NATIVE_mpn_gcd_finda double speed_mpn_gcd_finda (struct speed_params *s) diff --git a/tune/speed.c b/tune/speed.c index 90e3990de..abe9e70b8 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -255,17 +255,20 @@ const struct routine_t { { "mpn_popcount", speed_mpn_popcount }, { "mpn_hamdist", speed_mpn_hamdist }, + { "mpn_matrix22_mul", speed_mpn_matrix22_mul }, + { "mpn_hgcd", speed_mpn_hgcd }, -#if 0 { "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer }, -#endif + { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL }, { "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL }, { "mpn_gcd", speed_mpn_gcd }, +#if 0 { "mpn_gcd_binary", 
speed_mpn_gcd_binary }, { "mpn_gcd_accel", speed_mpn_gcd_accel }, { "find_a", speed_find_a, FLAG_NODATA }, +#endif #if HAVE_NATIVE_mpn_gcd_finda { "mpn_gcd_finda", speed_mpn_gcd_finda, FLAG_NODATA }, #endif diff --git a/tune/speed.h b/tune/speed.h index c2055ca4a..ff8a8f73c 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -182,6 +182,7 @@ double speed_mpn_divrem_2 _PROTO ((struct speed_params *s)); double speed_mpn_divrem_2_div _PROTO ((struct speed_params *s)); double speed_mpn_divrem_2_inv _PROTO ((struct speed_params *s)); double speed_mpn_fib2_ui _PROTO ((struct speed_params *s)); +double speed_mpn_matrix22_mul _PROTO ((struct speed_params *s)); double speed_mpn_hgcd _PROTO ((struct speed_params *s)); double speed_mpn_hgcd_lehmer _PROTO ((struct speed_params *s)); double speed_mpn_gcd _PROTO ((struct speed_params *s)); diff --git a/tune/tuneup.c b/tune/tuneup.c index fa6778dba..6d1acf9e0 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -162,10 +162,11 @@ mp_size_t mullow_mul_n_threshold = MP_SIZE_T_MAX; mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX; mp_size_t div_dc_threshold = MP_SIZE_T_MAX; mp_size_t powm_threshold = MP_SIZE_T_MAX; -mp_size_t hgcd_schoenhage_threshold = MP_SIZE_T_MAX; +mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; +mp_size_t hgcd_threshold = MP_SIZE_T_MAX; mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX; -mp_size_t gcd_schoenhage_threshold = MP_SIZE_T_MAX; -mp_size_t gcdext_schoenhage_threshold = MP_SIZE_T_MAX; +mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; +mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; @@ -1007,17 +1008,27 @@ tune_powm (void) void +tune_matrix22_mul (void) +{ + static struct param_t param; + param.name = "MATRIX22_STRASSEN_THRESHOLD"; + param.function = speed_mpn_matrix22_mul; + param.min_size = 2; + one (&matrix22_strassen_threshold, ¶m); +} + +void tune_hgcd 
(void) { static struct param_t param; - param.name = "HGCD_SCHOENHAGE_THRESHOLD"; + param.name = "HGCD_THRESHOLD"; param.function = speed_mpn_hgcd; /* We seem to get strange results for small sizes */ - param.min_size = 50; - param.step_factor = 0.05; - one (&hgcd_schoenhage_threshold, ¶m); + param.min_size = 30; + one (&hgcd_threshold, ¶m); } +#if 0 void tune_gcd_accel (void) { @@ -1027,29 +1038,29 @@ tune_gcd_accel (void) param.min_size = 1; one (&gcd_accel_threshold, ¶m); } - +#endif void -tune_gcd_schoenhage (void) +tune_gcd_dc (void) { static struct param_t param; - param.name = "GCD_SCHOENHAGE_THRESHOLD"; + param.name = "GCD_DC_THRESHOLD"; param.function = speed_mpn_gcd; - param.min_size = hgcd_schoenhage_threshold; + param.min_size = hgcd_threshold; param.max_size = 3000; param.step_factor = 0.1; - one (&gcd_schoenhage_threshold, ¶m); + one (&gcd_dc_threshold, ¶m); } void -tune_gcdext_schoenhage (void) +tune_gcdext_dc (void) { static struct param_t param; - param.name = "GCDEXT_SCHOENHAGE_THRESHOLD"; + param.name = "GCDEXT_DC_THRESHOLD"; param.function = speed_mpn_gcdext; - param.min_size = hgcd_schoenhage_threshold; + param.min_size = hgcd_threshold; param.max_size = 3000; param.step_factor = 0.1; - one (&gcdext_schoenhage_threshold, ¶m); + one (&gcdext_dc_threshold, ¶m); } @@ -1771,10 +1782,13 @@ all (void) tune_powm (); printf("\n"); + tune_matrix22_mul (); tune_hgcd (); + tune_gcd_dc (); + tune_gcdext_dc (); +#if 0 tune_gcd_accel (); - tune_gcd_schoenhage (); - tune_gcdext_schoenhage (); +#endif tune_jacobi_base (); printf("\n"); |