Diffstat:
-rw-r--r--      6  .hgignore
-rw-r--r--    222  ChangeLog
-rw-r--r--      7  configure.in
-rw-r--r--      3  gmp-h.in
-rw-r--r--    284  gmp-impl.h
-rw-r--r--      5  mpn/Makefile.am
-rw-r--r--      6  mpn/alpha/ev5/gmp-mparam.h
-rw-r--r--      4  mpn/alpha/ev6/gmp-mparam.h
-rw-r--r--      5  mpn/alpha/ev6/nails/gmp-mparam.h
-rw-r--r--      4  mpn/alpha/gmp-mparam.h
-rw-r--r--      1  mpn/arm/gmp-mparam.h
-rw-r--r--      5  mpn/cray/gmp-mparam.h
-rw-r--r--      5  mpn/cray/ieee/gmp-mparam.h
-rw-r--r--    989  mpn/generic/gcd.c
-rw-r--r--    161  mpn/generic/gcd_lehmer.c
-rw-r--r--    116  mpn/generic/gcd_subdiv_step.c
-rw-r--r--   1551  mpn/generic/gcdext.c
-rw-r--r--    319  mpn/generic/gcdext_1.c
-rw-r--r--    162  mpn/generic/gcdext_lehmer.c
-rw-r--r--    188  mpn/generic/gcdext_subdiv_step.c
-rw-r--r--   2381  mpn/generic/hgcd.c
-rw-r--r--    639  mpn/generic/hgcd2.c
-rw-r--r--    254  mpn/generic/matrix22_mul.c
-rw-r--r--      4  mpn/ia64/gmp-mparam.h
-rw-r--r--      5  mpn/m68k/gmp-mparam.h
-rw-r--r--      4  mpn/minithres/gmp-mparam.h
-rw-r--r--      1  mpn/mips32/gmp-mparam.h
-rw-r--r--      5  mpn/mips64/gmp-mparam.h
-rw-r--r--      1  mpn/pa32/gmp-mparam.h
-rw-r--r--      5  mpn/pa32/hppa1_1/gmp-mparam.h
-rw-r--r--      1  mpn/pa32/hppa2_0/gmp-mparam.h
-rw-r--r--      5  mpn/pa64/gmp-mparam.h
-rw-r--r--      5  mpn/power/gmp-mparam.h
-rw-r--r--      5  mpn/powerpc32/750/gmp-mparam.h
-rw-r--r--      5  mpn/powerpc32/gmp-mparam.h
-rw-r--r--      5  mpn/powerpc64/gmp-mparam.h
-rw-r--r--      1  mpn/s390/gmp-mparam.h
-rw-r--r--      1  mpn/sparc32/gmp-mparam.h
-rw-r--r--      5  mpn/sparc32/v8/gmp-mparam.h
-rw-r--r--      5  mpn/sparc32/v8/supersparc/gmp-mparam.h
-rw-r--r--      5  mpn/sparc32/v9/gmp-mparam.h
-rw-r--r--      1  mpn/sparc64/gmp-mparam.h
-rw-r--r--      1  mpn/vax/gmp-mparam.h
-rw-r--r--      1  mpn/x86/i486/gmp-mparam.h
-rw-r--r--      5  mpn/x86/k6/gmp-mparam.h
-rw-r--r--      4  mpn/x86/k7/gmp-mparam.h
-rw-r--r--      1  mpn/x86/p6/gmp-mparam.h
-rw-r--r--      4  mpn/x86/p6/mmx/gmp-mparam.h
-rw-r--r--      5  mpn/x86/pentium/gmp-mparam.h
-rw-r--r--      4  mpn/x86/pentium/mmx/gmp-mparam.h
-rw-r--r--      4  mpn/x86/pentium4/sse2/gmp-mparam.h
-rw-r--r--      8  mpn/x86_64/core2/gmp-mparam.h
-rw-r--r--      7  mpn/x86_64/gmp-mparam.h
-rw-r--r--      4  mpn/x86_64/pentium4/gmp-mparam.h
-rw-r--r--      2  tests/mpn/Makefile.am
-rw-r--r--    288  tests/mpn/t-hgcd.c
-rw-r--r--    207  tests/mpn/t-matrix22.c
-rw-r--r--      7  tests/mpz/t-gcd.c
-rw-r--r--      4  tune/Makefile.am
-rw-r--r--    126  tune/common.c
-rw-r--r--      7  tune/speed.c
-rw-r--r--      1  tune/speed.h
-rw-r--r--     50  tune/tuneup.c
63 files changed, 3274 insertions(+), 4852 deletions(-)
diff --git a/.hgignore b/.hgignore
index 71cb23983..817593cbf 100644
--- a/.hgignore
+++ b/.hgignore
@@ -37,9 +37,9 @@ Makefile
^doc/version\.texi
# All source files in mpn/ are either generated, or links
-^mpn/.*\.c
-^mpn/.*\.asm
-^
+^mpn/[^/]*\.c
+^mpn/[^/]*\.asm
+
^\.libs
.*\.a
diff --git a/ChangeLog b/ChangeLog
index 22118506a..79f9b13e2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -29,6 +29,38 @@
(DIVEXACT_BY3_METHOD): Don't default to 0 if
HAVE_NATIVE_mpn_divexact_by3c.
+2008-09-18 Niels Möller <nisse@lysator.liu.se>
+
+ * mpn/generic/gcd.c (main): Added code for tuning of CHOOSE_P.
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_mul): Assert that inputs are
+ normalized.
+
+2008-09-17 Niels Möller <nisse@lysator.liu.se> <nisse@king.swox.se>
+
+ * mpn/generic/gcdext.c (mpn_gcdext): p = n/5 caused a
+ slowdown for large inputs. As a compromise, use p = n/2 for the
+ first iteration, and p = n/3 for the rest. Handle the first
+ iteration specially, since the initial u0 and u1 are trivial.
+
+ * mpn/x86_64/gmp-mparam.h (GCDEXT_DC_THRESHOLD): Reduced threshold
+ from 409 to 390.
+
+ * mpn/generic/gcdext.c (CHOOSE_P): New macro. Use p = n/5.
+ (mpn_gcdext): Use CHOOSE_P, and generalized the calculation of
+ scratch space.
+
+ * tune/tuneup.c (tune_hgcd): Use default step factor.
+
+ * mpn/x86_64/gmp-mparam.h: (GCD_DC_THRESHOLD): Reduced from 493 to
+ 412.
+
+ * mpn/generic/gcd.c (CHOOSE_P): New macro, to determine the
+ split when calling hgcd. Use p = 2n/3, as that seems better than
+ the more obvious split p = n/2.
+ (mpn_gcd): Use CHOOSE_P, and generalized the calculation of
+ scratch space.
+
2008-09-16 Torbjorn Granlund <tege@swox.com>
* mpn/generic/toom_interpolate_7pts.c: Use new mpn_divexact_byN
@@ -55,14 +87,175 @@
Choose function depending on DIVEXACT_BY3_METHOD.
* gmp-impl.h (DIVEXACT_BY3_METHOD): Provide default.
+2008-09-16 Niels Möller <nisse@lysator.liu.se>
+
+ * mpn/generic/hgcd.c (mpn_hgcd_addmul2_n): Moved function to
+ gcdext.c, where it is used.
+ * mpn/generic/gcdext.c (addmul2_n): Moved and renamed, was
+ mpn_hgcd_addmul2_n. Made static. Deleted input normalization.
+ Deleted rn argument.
+ (mpn_gcdext): Updated calls to addmul2_n, and added assertions.
+
+ * gmp-impl.h (MPN_HGCD_MATRIX_INIT_ITCH): Increased storage by four limbs.
+ (MPN_HGCD_LEHMER_ITCH): Reduced storage by one limb.
+ (MPN_GCD_SUBDIV_STEP_ITCH): Likewise.
+ (MPN_GCD_LEHMER_N_ITCH): Likewise.
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_init): Use two extra limbs.
+ (hgcd_step): Use overlapping arguments to mpn_tdiv_qr.
+ (mpn_hgcd_matrix_mul): Deleted normalization code. Tighter bounds
+ for the element size of the product. Needs two extra limbs of
+ storage for the elements.
+ (mpn_hgcd_itch): Updated storage calculation.
+
+ * mpn/generic/gcd_subdiv_step.c (mpn_gcd_subdiv_step): Use
+ overlapping arguments to mpn_tdiv_qr. Use mpn_zero_p.
+
+ * mpn/generic/gcd.c (mpn_gcd): Use mpn_zero_p.
+
+2008-09-15 Niels Möller <nisse@lysator.liu.se>
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_init): Updated for deleted
+ tp pointer.
+ (hgcd_matrix_update_q): Likewise.
+ (mpn_hgcd_matrix_mul): Likewise.
+ (mpn_hgcd_itch): Updated calculation of scratch space.
+
+ * gmp-impl.h (struct hgcd_matrix): Deleted tp pointer.
+ (MPN_HGCD_MATRIX_INIT_ITCH): Reduced storage.
+ (mpn_hgcd_step, MPN_HGCD_STEP_ITCH): Deleted declarations.
+
+2008-09-15 Niels Möller <nisse@lysator.liu.se> <nisse@king.swox.se>
+
+ * mpn/x86_64/gmp-mparam.h (MATRIX22_STRASSEN_THRESHOLD): New
+ threshold.
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_mul): Use mpn_matrix22_mul.
+ (mpn_hgcd_itch): Updated calculation of scratch space. Use
+ count_leading_zeros to get the recursion depth.
+
+ * mpn/generic/gcd.c (mpn_gcd): Fixed calculation of scratch space,
+ and use mpn_hgcd_itch.
+
+2008-09-15 Niels Möller <nisse@lysator.liu.se>
+
+ * tune/tuneup.c (tune_matrix22_mul): New function.
+ (all): Use it.
+
+ * tune/common.c (speed_mpn_matrix22_mul): New function.
+
+ * tune/Makefile.am (TUNE_MPN_SRCS_BASIC): Added matrix22_mul.c.
+
+ * tests/mpn/t-matrix22.c: Use MATRIX22_STRASSEN_THRESHOLD to
+ select sizes for tests.
+
+ * gmp-impl.h (MATRIX22_STRASSEN_THRESHOLD): New threshold.
+
+ * configure.in (gmp_mpn_functions): Added matrix22_mul.
+ * gmp-impl.h: Added declarations for mpn_matrix22_mul and related
+ functions.
+
+ * mpn/Makefile.am (nodist_EXTRA_libmpn_la_SOURCES): Added
+ matrix22_mul.c.
+ * tests/mpn/Makefile.am (check_PROGRAMS): Added t-matrix22.
+
+ * tests/mpn/t-matrix22.c: New file.
+ * mpn/generic/matrix22_mul.c: New file.
+
+2008-09-11 Niels Möller <nisse@king.swox.se>
+
+ * tune/tuneup.c: Updated tuning of gcdext.
+
+ * mpn/x86_64/gmp-mparam.h (GCDEXT_DC_THRESHOLD): Reduced threshold
+ from 713 to 409.
+
+2008-09-11 Niels Möller <nisse@lysator.liu.se>
+
+ * gmp-impl.h: Updated for gcdext changes.
+ (GCDEXT_DC_THRESHOLD): New constant, renamed from
+ GCDEXT_SCHOENHAGE_THRESHOLD.
+
+ * mpn/generic/gcdext.c (compute_v): Accept non-normalized a and b
+ as inputs.
+ (mpn_gcdext): Rewrote and simplified. Now uses the new mpn_hgcd
+ interface.
+
+ * mpn/generic/hgcd.c (mpn_hgcd_addmul2_n): Renamed from addmul2_n
+ and made non-static. Changed interface to take non-normalized
+ inputs, and only two size arguments.
+ (mpn_hgcd_matrix_mul): Simplified using new mpn_hgcd_addmul2_n.
+
+ * mpn/generic/gcdext_lehmer.c (mpn_gcdext_lehmer_itch): Deleted
+ function.
+ (mpn_gcdext_lehmer_n): Renamed from mpn_gcd_lehmer. Now takes
+ inputs of equal size. Moved the code for the division step to a
+ separate function...
+ * mpn/generic/gcdext_subdiv_step.c (mpn_gcdext_subdiv_step): New
+ file, new function.
+
+ * configure.in (gmp_mpn_functions): Added gcdext_subdiv_step.
+
2008-09-10 Torbjorn Granlund <tege@swox.com>
* gmp-h.in: Unconditionally include <cstdio>.
+2008-09-10 Niels Möller <nisse@lysator.liu.se>
+
+ * tune/common.c: #if:ed out speed_mpn_gcd_binary and
+ speed_mpn_gcd_accel.
+ * tune/speed.c (routine): #if:ed out mpn_gcd_binary, mpn_gcd_accel
+ and find_a.
+ * tune/Makefile.am (libspeed_la_SOURCES): Removed gcd_bin.c
+ gcd_accel.c gcd_finda_gen.c.
+ * tune/tuneup.c: Enable tuning of GCD_DC_THRESHOLD.
+
+ * mpn/generic/gcd.c (mpn_gcd): Rewrote and simplified. Now uses
+ the new mpn_hgcd interface.
+
+ * */gmp-mparam.h: Renamed GCD_SCHOENHAGE_THRESHOLD to
+ GCD_DC_THRESHOLD.
+
+ * mpn/generic/gcd_lehmer.c (mpn_gcd_lehmer_n): Renamed (was
+ mpn_gcd_lehmer). Now takes inputs of equal size.
+
+ * mpn/generic/gcd_lehmer.c (mpn_gcd_lehmer): Reintroduced gcd_2,
+ to get better performance for small inputs.
+
+ * mpn/generic/hgcd.c: Don't hardcode small HGCD_THRESHOLD.
+ * mpn/x86_64/gmp-mparam.h (HGCD_THRESHOLD): Reduced from 145 to
+ 120.
+ * */gmp-mparam.h: Renamed HGCD_SCHOENHAGE_THRESHOLD to
+ HGCD_THRESHOLD.
+
2008-09-09 Torbjorn Granlund <tege@swox.com>
* doc/gmp.texi: Fix a typo and clarify mpn_gcdext docs.
+2008-09-09 Niels Möller <nisse@lysator.liu.se>
+
+ * tune/common.c (speed_mpn_hgcd, speed_mpn_hgcd_lehmer): Adapted
+ to new hgcd interface.
+
+ * gmp-impl.h (MPN_HGCD_LEHMER_ITCH): New macro.
+
+ * hgcd.c (mpn_hgcd_lehmer): Renamed function, from hgcd_base. Made
+ non-static.
+
+ * gcd_lehmer.c (mpn_gcd_lehmer): Use hgcd2 also for n == 2.
+
+ * gcdext_lehmer.c (mpn_gcdext_lehmer): Simplified code for
+ division step. Added proper book-keeping of swaps, which affect
+ the sign of the returned cofactor.
+
+ * tests/mpz/t-gcd.c (one_test): Display co-factor when mpn_gcdext
+ fails.
+
+ * gcd_lehmer.c (mpn_gcd_lehmer): At end of loop, need to handle
+ the special case n == 1 correctly.
+
+ * gcd_subdiv_step.c (mpn_gcd_subdiv_step): Simplified function.
+ The special cancellation logic is not needed here.
+
2008-09-08 Torbjorn Granlund <tege@swox.com>
* mpn/generic/invert.c: Add working but slow code.
@@ -94,6 +287,26 @@
* gmp-h.in (__GMP_CC): New #define.
(__GMP_CFLAGS): New #define.
+2008-09-08 Niels Möller <nisse@lysator.liu.se>
+
+ * tests/mpn/t-hgcd.c: Updated tests. Rewrite of hgcd_ref.
+
+ * mpn/generic/gcdext_lehmer.c (mpn_gcdext_lehmer_itch): New function.
+ (mpn_gcdext_lehmer): Various bugfixes.
+
+ * gcdext.c (mpn_gcdext): Allocate scratch space for gcdext_lehmer.
+
+ * mpn/generic/gcd_lehmer.c (gcd_2): ASSERT that inputs are odd.
+ (mpn_gcd_lehmer): Added tp argument, for scratch space. Make both
+ arguments odd before calling gcd_2.
+
+ * mpn/generic/hgcd.c (mpn_hgcd): Allow the trivial case n <= 2,
+ and return 0 immediately.
+
+ * gmp-impl.h (MPN_EXTRACT_NUMB): New macro.
+
+ * configure.in (gmp_mpn_functions): Added gcdext_lehmer.
+
2008-09-05 Torbjorn Granlund <tege@swox.com>
* mpn/generic/toom_interpolate_7pts.c: Use mpn_divexact_by3c instead of
@@ -856,6 +1069,12 @@
* mpn/generic/mul_fft.c: Optimize many scalar divisions and mod
operations into masks and shifts.
(mpn_fft_mul_modF_K): Fix a spurious ASSERT_NOCARRY.
+ (mpn_fft_belge_butterfly, mpn_fft_fft_belgeRec, mpn_fft_fft_belge,
+ mpn_fft_fft_belgeInvRec, mpn_fft_fft_belgeInv): Add Pierrick Gaudry's
+ implementation of the cache-optimized "belge" FFT code.
+ (mpn_fft_fft_sqr, mpn_fft_butterfly, mpn_fft_fft, mpn_fft_fftinv):
+ Remove.
+ (mpn_mul_fft_internal): Corresponding updates.
2006-03-26 Torbjorn Granlund <tege@swox.com>
@@ -1187,6 +1406,9 @@
* tests/mpz/reuse.c: Test mpz_rootrem.
+ From Paul Zimmermann:
+ * mpn/generic/rootrem.c: Complete rewrite.
+
2005-10-31 Torbjorn Granlund <tege@swox.com>
* mpz/pprime_p.c (mpz_probab_prime_p): Considerably limit trial
diff --git a/configure.in b/configure.in
index fe0584285..7eea50ee4 100644
--- a/configure.in
+++ b/configure.in
@@ -2407,8 +2407,11 @@ gmp_mpn_functions="$extra_functions \
fib2_ui mod_1 mod_34lsub1 mode1o pre_divrem_1 pre_mod_1 dump \
mul mul_fft mul_n mul_basecase sqr_basecase random random2 pow_1 \
rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
- bdivmod gcd_1 gcd gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \
- hgcd2 hgcd qstack mullow_n mullow_basecase \
+ bdivmod gcd_1 gcd gcdext_1 gcdext gcd_lehmer gcd_subdiv_step \
+ gcdext_lehmer gcdext_subdiv_step \
+ tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \
+ matrix22_mul \
+ hgcd2 hgcd mullow_n mullow_basecase \
mul_toom22 mul_toom32 mul_toom42 mul_toom62 mul_toom53 mul_toom44 \
toom_interpolate_5pts toom_interpolate_7pts invert binvert \
sb_div_qr sb_divappr_q sb_div_q dc_div_qr dc_divappr_q dc_div_q \
diff --git a/gmp-h.in b/gmp-h.in
index 0488a5ad5..99ba5b3e5 100644
--- a/gmp-h.in
+++ b/gmp-h.in
@@ -1505,6 +1505,9 @@ __GMP_DECLSPEC mp_size_t mpn_gcd __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr
#define mpn_gcd_1 __MPN(gcd_1)
__GMP_DECLSPEC mp_limb_t mpn_gcd_1 __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)) __GMP_ATTRIBUTE_PURE;
+#define mpn_gcdext_1 __MPN(gcdext_1)
+__GMP_DECLSPEC mp_limb_t mpn_gcdext_1 __GMP_PROTO ((mp_ptr, mp_ptr, mp_limb_t, mp_limb_t)) __GMP_ATTRIBUTE_PURE;
+
#define mpn_gcdext __MPN(gcdext)
__GMP_DECLSPEC mp_size_t mpn_gcdext __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
diff --git a/gmp-impl.h b/gmp-impl.h
index 0433e8527..4dcfc6497 100644
--- a/gmp-impl.h
+++ b/gmp-impl.h
@@ -71,6 +71,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
mp_limb_t name __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t))
#define DECL_gcd_1(name) \
mp_limb_t name __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t))
+#define DECL_gcdext_1(name) \
+ mp_limb_t name __GMP_PROTO ((mp_ptr, mp_ptr, mp_limb_t, mp_limb_t))
#define DECL_lshift(name) \
mp_limb_t name __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, unsigned))
#define DECL_mod_1(name) \
@@ -3439,176 +3441,156 @@ void __gmp_invalid_operation _PROTO ((void)) ATTRIBUTE_NORETURN;
} \
} while (0)
-
-/* HGCD definitions */
-
-/* Limited by 2 + twice the bitsize of mp_size_t */
-#define QSTACK_MAX_QUOTIENTS 82
-
-/* Name mangling */
-#define qstack_itch __gmpn_qstack_itch
-#define qstack_init __gmpn_qstack_init
-#define qstack_reset __gmpn_qstack_reset
-#define qstack_rotate __gmpn_qstack_rotate
-
-#define mpn_hgcd2 __gmpn_hgcd2
-#define mpn_hgcd2_fix __gmpn_hgcd2_fix
-#define mpn_hgcd2_lehmer_step __gmpn_hgcd2_lehmer_step
-#define mpn_hgcd_max_recursion __gmpn_hgcd_max_recursion
-#define mpn_hgcd_init_itch __gmpn_hgcd_init_itch
-#define mpn_hgcd_init __gmpn_hgcd_init
-#define mpn_hgcd_lehmer_itch __gmpn_hgcd_lehmer_itch
-#define mpn_hgcd_lehmer __gmpn_hgcd_lehmer
-#define mpn_hgcd_itch __gmpn_hgcd_itch
-#define mpn_hgcd __gmpn_hgcd
-#define mpn_hgcd_equal __gmpn_hgcd_equal
-#define mpn_hgcd_fix __gmpn_hgcd_fix
-
-struct qstack
-{
- /* Throughout the code we represent q = 1 with qsize = 0. */
- mp_size_t size[QSTACK_MAX_QUOTIENTS];
- mp_ptr limb;
- mp_size_t limb_alloc;
-
- /* Number of quotients to keep when we discard old quotients */
- unsigned nkeep;
-
- /* Top quotient is of size size[size_next-1], and starts at
- limb+limb_next - size[size_next-1]. We use size_next == 0 for an
- empty stack.*/
- unsigned size_next;
- mp_size_t limb_next;
-};
+/* Matrix multiplication */
+#define mpn_matrix22_mul __MPN(matrix22_mul)
+#define mpn_matrix22_strassen __MPN(matrix22_mul_strassen)
+#define mpn_matrix22_mul_itch __MPN(matrix22_mul_itch)
mp_size_t
-qstack_itch __GMP_PROTO ((mp_size_t));
+mpn_matrix22_mul_itch (mp_size_t, mp_size_t);
void
-qstack_init __GMP_PROTO ((struct qstack *, mp_size_t, mp_limb_t *, mp_size_t));
-
+mpn_matrix22_mul (mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t,
+ mp_srcptr, mp_srcptr, mp_srcptr, mp_srcptr, mp_size_t,
+ mp_ptr);
void
-qstack_reset __GMP_PROTO ((struct qstack *, mp_size_t));
+mpn_matrix22_mul_strassen (mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t,
+ mp_srcptr, mp_srcptr, mp_srcptr, mp_srcptr, mp_size_t,
+ mp_ptr);
-void
-qstack_rotate __GMP_PROTO ((struct qstack *, mp_size_t));
-
-#if WANT_ASSERT
-void
-__gmpn_qstack_sanity __GMP_PROTO ((struct qstack *));
-#define ASSERT_QSTACK __gmpn_qstack_sanity
-#else
-#define ASSERT_QSTACK(stack)
+#ifndef MATRIX22_STRASSEN_THRESHOLD
+#define MATRIX22_STRASSEN_THRESHOLD 30
#endif
-struct hgcd2_row
-{
- /* r = (-)u a + (-)v b */
- mp_limb_t u;
- mp_limb_t v;
-};
+/* HGCD definitions */
+
+/* Extract one numb, shifting count bits left
+       ________  ________
+      |___xh___||___xl___|
+        |____r____|
+      >count <
+
+ The count includes any nail bits, so it should work fine if count
+ is computed using count_leading_zeros. If GMP_NAIL_BITS > 0, all of
+ xh, xl and r include nail bits. Must have 0 < count < GMP_LIMB_BITS.
-struct hgcd2
+ FIXME: Omit masking with GMP_NUMB_MASK, and let callers do that for
+ those calls where the count high bits of xh may be non-zero.
+*/
+
+#define MPN_EXTRACT_NUMB(count, xh, xl) \
+ ((((xh) << ((count) - GMP_NAIL_BITS)) & GMP_NUMB_MASK) | \
+ ((xl) >> (GMP_LIMB_BITS - (count))))
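How MPN_EXTRACT_NUMB is meant to be combined with count_leading_zeros is easiest to see in isolation. The sketch below is not part of the patch; it only mirrors the pattern used in mpn/generic/gcd_lehmer.c further down in this diff (the helper name is made up, and GMP_NAIL_BITS == 0 is assumed for simplicity).

static void
extract_top_numbs (mp_srcptr ap, mp_size_t n, mp_limb_t *ah, mp_limb_t *al)
{
  /* {ap, n} is assumed normalized, with n >= 3 */
  if (ap[n - 1] & GMP_NUMB_HIGHBIT)
    {
      /* Top limb already has its high bit set; no shift needed */
      *ah = ap[n - 1];
      *al = ap[n - 2];
    }
  else
    {
      int shift;
      /* 0 < shift < GMP_LIMB_BITS, since ap[n - 1] != 0 */
      count_leading_zeros (shift, ap[n - 1]);
      *ah = MPN_EXTRACT_NUMB (shift, ap[n - 1], ap[n - 2]);
      *al = MPN_EXTRACT_NUMB (shift, ap[n - 2], ap[n - 3]);
    }
}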
+
+#define mpn_hgcd2 __MPN (hgcd2)
+#define mpn_hgcd_mul_matrix1_vector __MPN (hgcd_mul_matrix1_vector)
+#define mpn_hgcd_mul_matrix1_inverse_vector __MPN (hgcd_mul_matrix1_inverse_vector)
+
+#define mpn_hgcd_matrix_init __MPN (hgcd_matrix_init)
+#define mpn_hgcd_matrix_mul __MPN (hgcd_matrix_mul)
+#define mpn_hgcd_matrix_adjust __MPN (hgcd_matrix_adjust)
+
+#define mpn_hgcd_step __MPN (hgcd_step)
+#define mpn_hgcd_itch __MPN (hgcd_itch)
+#define mpn_hgcd __MPN (hgcd)
+#define mpn_hgcd_lehmer __MPN (hgcd_lehmer)
+
+#define mpn_gcd_lehmer_n __MPN(gcd_lehmer_n)
+#define mpn_gcd_subdiv_step __MPN(gcd_subdiv_step)
+#define mpn_gcdext_lehmer_n __MPN(gcdext_lehmer_n)
+#define mpn_gcdext_subdiv_step __MPN(gcdext_subdiv_step)
+
+/* The non-negative matrix M = (u, u'; v,v') keeps track of the
+ reduction (a;b) = M (alpha; beta) where alpha, beta are smaller
+ than a, b. The determinant must always be one, so that M has an
+ inverse (v', -u'; -v, u). Elements always fit in GMP_NUMB_BITS - 1
+ bits. */
+struct hgcd_matrix1
{
- /* Sign of the first row, sign >= 0 implies that u >= 0 and v <= 0,
- sign < 0 implies u <= 0, v >= 0 */
- int sign;
- struct hgcd2_row row[4];
+ mp_limb_t u[2][2];
};
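The determinant-one property described in the comment above is what makes it possible to undo a reduction exactly. Here is a minimal single-word sketch, with made-up numbers and not part of the patch, of the relation that mpn_hgcd_mul_matrix1_inverse_vector relies on:

#include <assert.h>

static void
hgcd_matrix1_inverse_example (void)
{
  long u = 3, u1 = 4, v = 2, v1 = 3;    /* M = (u, u'; v, v'), with u1 = u', v1 = v' */
  long alpha = 7, beta = 5;             /* the smaller, reduced pair */
  long a = u * alpha + u1 * beta;       /* (a;b) = M (alpha;beta), so a = 41 */
  long b = v * alpha + v1 * beta;       /* and b = 29 */

  assert (u * v1 - u1 * v == 1);        /* determinant is one */
  /* hence M^-1 = (v', -u'; -v, u) recovers alpha and beta exactly */
  assert (v1 * a - u1 * b == alpha);
  assert (-v * a + u * b == beta);
}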
int
-mpn_hgcd2 __GMP_PROTO ((struct hgcd2 *,
- mp_limb_t, mp_limb_t,
- mp_limb_t, mp_limb_t,
- struct qstack *));
+mpn_hgcd2 __GMP_PROTO ((mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t,
+ struct hgcd_matrix1 *));
mp_size_t
-mpn_hgcd2_fix __GMP_PROTO ((mp_ptr, mp_size_t,
- int,
- mp_limb_t, mp_srcptr, mp_size_t,
- mp_limb_t, mp_srcptr, mp_size_t));
-
-int
-mpn_hgcd2_lehmer_step __GMP_PROTO ((struct hgcd2 *,
- mp_srcptr, mp_size_t,
- mp_srcptr, mp_size_t,
- struct qstack *));
-
-unsigned
-mpn_hgcd_max_recursion __GMP_PROTO ((mp_size_t));
+mpn_hgcd_mul_matrix1_vector __GMP_PROTO ((struct hgcd_matrix1 *, mp_size_t,
+ mp_ptr, mp_ptr, mp_ptr));
-struct hgcd_row
-{
- /* [rp, rsize] should always be normalized. */
- mp_ptr rp; mp_size_t rsize;
- mp_ptr uvp[2];
-};
+mp_size_t
+mpn_hgcd_mul_matrix1_inverse_vector __GMP_PROTO ((struct hgcd_matrix1 *, mp_size_t,
+ mp_ptr, mp_ptr, mp_ptr));
-struct hgcd
+struct hgcd_matrix
{
- int sign;
- /* Space allocated for the uv entries, for sanity checking */
+ /* For sanity checking only */
mp_size_t alloc;
- /* Size of the largest u,v entry, usually row[3].uvp[1]. This
- element should be normalized. Smaller elements must be zero
- padded, and all unused limbs (i.e. between size and alloc) must
- be zero. */
- mp_size_t size;
- struct hgcd_row row[4];
+
+ mp_size_t n;
+ mp_ptr p[2][2];
};
-mp_size_t
-mpn_hgcd_init_itch __GMP_PROTO ((mp_size_t));
+#define MPN_HGCD_MATRIX_INIT_ITCH(n) (4 * ((n+1)/2 + 1))
void
-mpn_hgcd_init __GMP_PROTO ((struct hgcd *,
- mp_size_t,
- mp_limb_t *));
+mpn_hgcd_matrix_init __GMP_PROTO ((struct hgcd_matrix *, mp_size_t, mp_ptr));
+void
+mpn_hgcd_matrix_mul __GMP_PROTO ((struct hgcd_matrix *, const struct hgcd_matrix *,
+ mp_ptr));
mp_size_t
-mpn_hgcd_lehmer_itch __GMP_PROTO ((mp_size_t));
-
-int
-mpn_hgcd_lehmer __GMP_PROTO ((struct hgcd *,
- mp_srcptr, mp_size_t,
- mp_srcptr, mp_size_t,
- struct qstack *,
- mp_ptr, mp_size_t));
+mpn_hgcd_matrix_adjust __GMP_PROTO ((struct hgcd_matrix *,
+ mp_size_t, mp_ptr, mp_ptr,
+ mp_size_t, mp_ptr));
mp_size_t
mpn_hgcd_itch __GMP_PROTO ((mp_size_t));
-int
-mpn_hgcd __GMP_PROTO ((struct hgcd *,
- mp_srcptr, mp_size_t,
- mp_srcptr, mp_size_t,
- struct qstack *,
- mp_ptr, mp_size_t));
+mp_size_t
+mpn_hgcd __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t,
+ struct hgcd_matrix *, mp_ptr));
-#if WANT_ASSERT
-void
-__gmpn_hgcd_sanity __GMP_PROTO ((const struct hgcd *,
- mp_srcptr, mp_size_t,
- mp_srcptr, mp_size_t,
- unsigned, unsigned));
-#define ASSERT_HGCD __gmpn_hgcd_sanity
-#else
-#define ASSERT_HGCD(hgcd, ap, asize, bp, bsize, start, end)
-#endif
+#define MPN_HGCD_LEHMER_ITCH(n) (n)
-int
-mpn_hgcd_equal __GMP_PROTO ((const struct hgcd *, const struct hgcd *));
+mp_size_t
+mpn_hgcd_lehmer __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t,
+ struct hgcd_matrix *, mp_ptr));
+
+/* Needs storage for the quotient */
+#define MPN_GCD_SUBDIV_STEP_ITCH(n) (n)
+
+mp_size_t
+mpn_gcd_subdiv_step __GMP_PROTO ((mp_ptr, mp_size_t *,
+ mp_ptr, mp_ptr, mp_size_t, mp_ptr));
+
+#define MPN_GCD_LEHMER_N_ITCH(n) (n)
+
+mp_size_t
+mpn_gcd_lehmer_n __GMP_PROTO ((mp_ptr, mp_ptr, mp_ptr, mp_size_t,
+ mp_ptr));
+
+/* To calculate the needed scratch space, n should be a bound for both
+ input and output sizes. */
+#define MPN_GCDEXT_SUBDIV_ITCH(n) (2*(n) + 1)
+
+mp_size_t
+mpn_gcdext_subdiv_step __GMP_PROTO ((mp_ptr, mp_size_t *, mp_ptr, mp_size_t *,
+ mp_ptr, mp_ptr, mp_size_t,
+ mp_ptr, mp_ptr, mp_size_t *, mp_ptr));
+
+#define MPN_GCDEXT_LEHMER_N_ITCH(n) (4*(n) + 3)
mp_size_t
-mpn_hgcd_fix __GMP_PROTO ((mp_size_t,
- mp_ptr, mp_size_t,
- int, mp_size_t,
- const struct hgcd_row *,
- mp_srcptr, mp_srcptr,
- mp_ptr, mp_size_t));
+mpn_gcdext_lehmer_n __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *,
+ mp_ptr, mp_ptr, mp_size_t,
+ mp_ptr));
+
+/* 4*(an + 1) + 4*(bn + 1) + an */
+#define MPN_GCDEXT_LEHMER_ITCH(an, bn) (5*(an) + 4*(bn) + 8)
-#ifndef HGCD_SCHOENHAGE_THRESHOLD
-#define HGCD_SCHOENHAGE_THRESHOLD 150
+#ifndef HGCD_THRESHOLD
+#define HGCD_THRESHOLD 400
#endif
#if 0
@@ -3617,12 +3599,12 @@ mpn_hgcd_fix __GMP_PROTO ((mp_size_t,
#endif
#endif
-#ifndef GCD_SCHOENHAGE_THRESHOLD
-#define GCD_SCHOENHAGE_THRESHOLD 1000
+#ifndef GCD_DC_THRESHOLD
+#define GCD_DC_THRESHOLD 1000
#endif
-#ifndef GCDEXT_SCHOENHAGE_THRESHOLD
-#define GCDEXT_SCHOENHAGE_THRESHOLD 600
+#ifndef GCDEXT_DC_THRESHOLD
+#define GCDEXT_DC_THRESHOLD 600
#endif
/* Definitions for mpn_set_str and mpn_get_str */
@@ -4044,9 +4026,13 @@ extern mp_size_t div_dc_threshold;
#define POWM_THRESHOLD powm_threshold
extern mp_size_t powm_threshold;
-#undef HGCD_SCHOENHAGE_THRESHOLD
-#define HGCD_SCHOENHAGE_THRESHOLD hgcd_schoenhage_threshold
-extern mp_size_t hgcd_schoenhage_threshold;
+#undef MATRIX22_STRASSEN_THRESHOLD
+#define MATRIX22_STRASSEN_THRESHOLD matrix22_strassen_threshold
+extern mp_size_t matrix22_strassen_threshold;
+
+#undef HGCD_THRESHOLD
+#define HGCD_THRESHOLD hgcd_threshold
+extern mp_size_t hgcd_threshold;
#undef GCD_ACCEL_THRESHOLD
#define GCD_ACCEL_THRESHOLD gcd_accel_threshold
@@ -4058,13 +4044,13 @@ extern mp_size_t gcd_accel_threshold;
extern mp_size_t gcd_lehmer_threshold;
#endif
-#undef GCD_SCHOENHAGE_THRESHOLD
-#define GCD_SCHOENHAGE_THRESHOLD gcd_schoenhage_threshold
-extern mp_size_t gcd_schoenhage_threshold;
+#undef GCD_DC_THRESHOLD
+#define GCD_DC_THRESHOLD gcd_dc_threshold
+extern mp_size_t gcd_dc_threshold;
-#undef GCDEXT_SCHOENHAGE_THRESHOLD
-#define GCDEXT_SCHOENHAGE_THRESHOLD gcdext_schoenhage_threshold
-extern mp_size_t gcdext_schoenhage_threshold;
+#undef GCDEXT_DC_THRESHOLD
+#define GCDEXT_DC_THRESHOLD gcdext_dc_threshold
+extern mp_size_t gcdext_dc_threshold;
#undef DIVREM_1_NORM_THRESHOLD
#define DIVREM_1_NORM_THRESHOLD divrem_1_norm_threshold
diff --git a/mpn/Makefile.am b/mpn/Makefile.am
index 78f88e24c..d883ec2b8 100644
--- a/mpn/Makefile.am
+++ b/mpn/Makefile.am
@@ -40,7 +40,8 @@ nodist_EXTRA_libmpn_la_SOURCES = \
dump.c fib2_ui.c gcd.c \
gcd_finda.c gcd_1.c gcdext.c get_d.c get_str.c \
hamdist.c hgcd2.c hgcd.c invert_limb.c \
- ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \
+ ior_n.c iorn_n.c jacbase.c lshift.c \
+ matrix22_mul.c mod_1.c mod_34lsub1.c mode1o.c \
mul.c mul_1.c mul_2.c mul_3.c mul_4.c mul_fft.c mul_n.c mul_basecase.c \
mul_toom22.c mul_toom32.c mul_toom42.c \
mullow_n.c mullow_basecase.c nand_n.c nior_n.c perfsqr.c popcount.c \
@@ -72,5 +73,7 @@ mp_bases.c:
perfsqr.h:
cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h
+tune-gcd-p: gcd.c
+ $(COMPILE) -DTUNE_GCD_P=1 gcd.c -o tune-gcd-p -L ../.libs -lgmp
include Makeasm.am
diff --git a/mpn/alpha/ev5/gmp-mparam.h b/mpn/alpha/ev5/gmp-mparam.h
index a58805781..9de9c07a2 100644
--- a/mpn/alpha/ev5/gmp-mparam.h
+++ b/mpn/alpha/ev5/gmp-mparam.h
@@ -41,10 +41,10 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 46
#define POWM_THRESHOLD 87
-#define HGCD_SCHOENHAGE_THRESHOLD 97
+#define HGCD_THRESHOLD 106
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 566
-#define GCDEXT_SCHOENHAGE_THRESHOLD 322
+#define GCD_DC_THRESHOLD 622
+#define GCDEXT_SCHOENHAGE_THRESHOLD 293
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
diff --git a/mpn/alpha/ev6/gmp-mparam.h b/mpn/alpha/ev6/gmp-mparam.h
index 33ea80a54..f259a2278 100644
--- a/mpn/alpha/ev6/gmp-mparam.h
+++ b/mpn/alpha/ev6/gmp-mparam.h
@@ -42,9 +42,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 116
#define POWM_THRESHOLD 212
-#define HGCD_SCHOENHAGE_THRESHOLD 407
+#define HGCD_THRESHOLD 407
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 867
+#define GCD_DC_THRESHOLD 867
#define GCDEXT_SCHOENHAGE_THRESHOLD 867
#define JACOBI_BASE_METHOD 1
diff --git a/mpn/alpha/ev6/nails/gmp-mparam.h b/mpn/alpha/ev6/nails/gmp-mparam.h
index 5d884e3bb..1bc93b52c 100644
--- a/mpn/alpha/ev6/nails/gmp-mparam.h
+++ b/mpn/alpha/ev6/nails/gmp-mparam.h
@@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 48
#define POWM_THRESHOLD 113
-#define HGCD_SCHOENHAGE_THRESHOLD 78
+#define HGCD_THRESHOLD 78
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 392
-#define GCDEXT_THRESHOLD 0 /* always */
+#define GCD_DC_THRESHOLD 392
#define JACOBI_BASE_METHOD 1
#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
diff --git a/mpn/alpha/gmp-mparam.h b/mpn/alpha/gmp-mparam.h
index 138cc5438..37f700494 100644
--- a/mpn/alpha/gmp-mparam.h
+++ b/mpn/alpha/gmp-mparam.h
@@ -41,9 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 38
#define POWM_THRESHOLD 53
-#define HGCD_SCHOENHAGE_THRESHOLD 63
+#define HGCD_THRESHOLD 63
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 476
+#define GCD_DC_THRESHOLD 476
#define GCDEXT_SCHOENHAGE_THRESHOLD 225
#define JACOBI_BASE_METHOD 2
diff --git a/mpn/arm/gmp-mparam.h b/mpn/arm/gmp-mparam.h
index a142605fb..80b6ff8ee 100644
--- a/mpn/arm/gmp-mparam.h
+++ b/mpn/arm/gmp-mparam.h
@@ -37,7 +37,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define POWM_THRESHOLD 150
#define GCD_ACCEL_THRESHOLD 3
-#define GCDEXT_THRESHOLD 0
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
diff --git a/mpn/cray/gmp-mparam.h b/mpn/cray/gmp-mparam.h
index b7da45c43..72dcb627d 100644
--- a/mpn/cray/gmp-mparam.h
+++ b/mpn/cray/gmp-mparam.h
@@ -41,10 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 996
#define POWM_THRESHOLD 601
-#define HGCD_SCHOENHAGE_THRESHOLD 964
+#define HGCD_THRESHOLD 964
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 2874
-#define GCDEXT_THRESHOLD 6
+#define GCD_DC_THRESHOLD 2874
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
diff --git a/mpn/cray/ieee/gmp-mparam.h b/mpn/cray/ieee/gmp-mparam.h
index d5a866000..03d655c81 100644
--- a/mpn/cray/ieee/gmp-mparam.h
+++ b/mpn/cray/ieee/gmp-mparam.h
@@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 390
#define POWM_THRESHOLD 656
-#define HGCD_SCHOENHAGE_THRESHOLD 964
+#define HGCD_THRESHOLD 964
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 964
-#define GCDEXT_THRESHOLD 0 /* always */
+#define GCD_DC_THRESHOLD 964
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
diff --git a/mpn/generic/gcd.c b/mpn/generic/gcd.c
index 30d6969a3..786c328f3 100644
--- a/mpn/generic/gcd.c
+++ b/mpn/generic/gcd.c
@@ -18,852 +18,255 @@ License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-/* Integer greatest common divisor of two unsigned integers, using
- the accelerated algorithm (see reference below).
-
- mp_size_t mpn_gcd (up, usize, vp, vsize).
-
- Preconditions [U = (up, usize) and V = (vp, vsize)]:
-
- 1. V is odd.
- 2. numbits(U) >= numbits(V).
-
- Both U and V are destroyed by the operation. The result is left at vp,
- and its size is returned.
-
- Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
-
- Funding for this work has been partially provided by Conselho Nacional
- de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
- 301314194-2, and was done while I was a visiting reseacher in the Instituto
- de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
-
- Refer to
- K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
- Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
-
-#include <stdio.h> /* for NULL */
-
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-
-/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
- algorithm is used, otherwise the binary algorithm is used. This may be
- adjusted for different architectures. */
-#ifndef GCD_ACCEL_THRESHOLD
-#define GCD_ACCEL_THRESHOLD 5
-#endif
-
-/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
- algorithm reduces using the bmod operation. Otherwise, the k-ary reduction
- is used. 0 <= BMOD_THRESHOLD < GMP_NUMB_BITS. */
-enum
- {
- BMOD_THRESHOLD = GMP_NUMB_BITS/2
- };
-
-
-/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
- Both U and V must be odd. */
-static inline mp_size_t
-gcd_2 (mp_ptr vp, mp_srcptr up)
+static inline int
+mpn_zero_p (mp_srcptr ap, mp_size_t n)
{
- mp_limb_t u0, u1, v0, v1;
- mp_size_t vsize;
-
- u0 = up[0];
- u1 = up[1];
- v0 = vp[0];
- v1 = vp[1];
-
- while (u1 != v1 && u0 != v0)
+ mp_size_t i;
+ for (i = n - 1; i >= 0; i--)
{
- unsigned long int r;
- if (u1 > v1)
- {
- u1 -= v1 + (u0 < v0);
- u0 = (u0 - v0) & GMP_NUMB_MASK;
- count_trailing_zeros (r, u0);
- u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r);
- u1 >>= r;
- }
- else /* u1 < v1. */
- {
- v1 -= u1 + (v0 < u0);
- v0 = (v0 - u0) & GMP_NUMB_MASK;
- count_trailing_zeros (r, v0);
- v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r);
- v1 >>= r;
- }
+ if (ap[i] != 0)
+ return 0;
}
-
- vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
-
- /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
- if (u1 == v1 && u0 == v0)
- return vsize;
-
- v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
- vp[0] = mpn_gcd_1 (vp, vsize, v0);
-
return 1;
}
-/* The function find_a finds 0 < N < 2^GMP_NUMB_BITS such that there exists
- 0 < |D| < 2^GMP_NUMB_BITS, and N == D * C mod 2^(2*GMP_NUMB_BITS).
- In the reference article, D was computed along with N, but it is better to
- compute D separately as D <-- N / C mod 2^(GMP_NUMB_BITS + 1), treating
- the result as a twos' complement signed integer.
-
- Initialize N1 to C mod 2^(2*GMP_NUMB_BITS). According to the reference
- article, N2 should be initialized to 2^(2*GMP_NUMB_BITS), but we use
- 2^(2*GMP_NUMB_BITS) - N1 to start the calculations within double
- precision. If N2 > N1 initially, the first iteration of the while loop
- will swap them. In all other situations, N1 >= N2 is maintained. */
-
-#if HAVE_NATIVE_mpn_gcd_finda
-#define find_a(cp) mpn_gcd_finda (cp)
+/* Uses the HGCD operation described in
+
+ N. Möller, On Schönhage's algorithm and subquadratic integer gcd
+ computation, Math. Comp. 77 (2008), 589-607.
+
+ to reduce inputs until they are of size below GCD_DC_THRESHOLD, and
+ then uses Lehmer's algorithm.
+*/
+
+/* Some reasonable choices are n / 2 (same as in hgcd), and p = (n +
+ * 2)/3, which gives a balanced multiplication in
+ * mpn_hgcd_matrix_adjust. However, p = 2 n/3 gives slightly better
+ * performance. The matrix-vector multiplication is then
+ * 4:1-unbalanced, with matrix elements of size n/6, and vector
+ * elements of size p = 2n/3. */
+
+/* From analysis of the theoretical running time, it appears that when
+ * multiplication takes time O(n^alpha), p should be choosen so that
+ * the ratio of the time for the mpn_hgcd call, and the time for the
+ * multiplication in mpn_hgcd_matrix_adjust, is roughly 1/(alpha -
+ * 1). */
+#ifdef TUNE_GCD_P
+#define P_TABLE_SIZE 10000
+mp_size_t p_table[P_TABLE_SIZE];
+#define CHOOSE_P(n) ( (n) < P_TABLE_SIZE ? p_table[n] : 2*(n)/3)
#else
-static
-#if ! defined (__i386__)
-inline /* don't inline this for the x86 */
+#define CHOOSE_P(n) (2*(n) / 3)
#endif
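To put the comments above in concrete, made-up numbers: for n = 3000 limbs, CHOOSE_P gives p = 2000, so mpn_hgcd is called on the top n - p = 1000 limbs and produces matrix elements of roughly (n - p)/2 = 500, about n/6, limbs, which mpn_hgcd_matrix_adjust then multiplies by the p-limb low parts, giving the 4:1-unbalanced products mentioned above. For Karatsuba-range sizes, alpha = log2(3), about 1.585, so the rule of thumb from the comment suggests the hgcd call should take roughly 1/(alpha - 1), about 1.7, times as long as the adjusting multiplication.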
-mp_limb_t
-find_a (mp_srcptr cp)
-{
- unsigned long int leading_zero_bits = 0;
- mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^GMP_NUMB_BITS + n1_l. */
- mp_limb_t n1_h = cp[1];
-
- mp_limb_t n2_l = (-n1_l & GMP_NUMB_MASK); /* N2 == n2_h * 2^GMP_NUMB_BITS + n2_l. */
- mp_limb_t n2_h = (~n1_h & GMP_NUMB_MASK);
-
- /* Main loop. */
- while (n2_h != 0) /* While N2 >= 2^GMP_NUMB_BITS. */
- {
- /* N1 <-- N1 % N2. */
- if (((GMP_NUMB_HIGHBIT >> leading_zero_bits) & n2_h) == 0)
- {
- unsigned long int i;
- count_leading_zeros (i, n2_h);
- i -= GMP_NAIL_BITS;
- i -= leading_zero_bits;
- leading_zero_bits += i;
- n2_h = ((n2_h << i) & GMP_NUMB_MASK) | (n2_l >> (GMP_NUMB_BITS - i));
- n2_l = (n2_l << i) & GMP_NUMB_MASK;
- do
- {
- if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
- {
- n1_h -= n2_h + (n1_l < n2_l);
- n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
- }
- n2_l = (n2_l >> 1) | ((n2_h << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK);
- n2_h >>= 1;
- i -= 1;
- }
- while (i != 0);
- }
- if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
- {
- n1_h -= n2_h + (n1_l < n2_l);
- n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
- }
-
- MP_LIMB_T_SWAP (n1_h, n2_h);
- MP_LIMB_T_SWAP (n1_l, n2_l);
- }
-
- return n2_l;
-}
-#endif
-
-/* v must be odd */
-static mp_size_t
-gcd_binary_odd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
+mp_size_t
+mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n)
{
- mp_ptr orig_vp = vp;
- mp_size_t orig_vsize = vsize;
- int binary_gcd_ctr; /* Number of times binary gcd will execute. */
+ mp_size_t talloc;
+ mp_size_t scratch;
+ mp_size_t matrix_scratch;
+
+ mp_size_t gn;
+ mp_ptr tp;
TMP_DECL;
- ASSERT (usize >= 1);
- ASSERT (vsize >= 1);
- ASSERT (usize >= vsize);
- ASSERT (vp[0] & 1);
- ASSERT (up[usize - 1] != 0);
- ASSERT (vp[vsize - 1] != 0);
-#if WANT_ASSERT
- if (usize == vsize)
+ /* FIXME: Check for small sizes first, before setting up temporary
+ storage etc. */
+ talloc = MPN_GCD_LEHMER_N_ITCH(n);
+
+ /* For initial division */
+ scratch = usize - n + 1;
+ if (scratch > talloc)
+ talloc = scratch;
+
+#if TUNE_GCD_P
+ if (CHOOSE_P (n) > 0)
+#else
+ if (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD))
+#endif
{
- int uzeros, vzeros;
- count_leading_zeros (uzeros, up[usize - 1]);
- count_leading_zeros (vzeros, vp[vsize - 1]);
- ASSERT (uzeros <= vzeros);
- }
+ mp_size_t hgcd_scratch;
+ mp_size_t update_scratch;
+ mp_size_t p = CHOOSE_P (n);
+ mp_size_t scratch;
+#if TUNE_GCD_P
+ /* Worst case, since we don't guarantee that n - CHOOSE_P(n)
+ is increasing */
+ matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n);
+ hgcd_scratch = mpn_hgcd_itch (n);
+ update_scratch = 2*(n - 1);
+#else
+ matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
+ hgcd_scratch = mpn_hgcd_itch (n - p);
+ update_scratch = p + n - 1;
#endif
- ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
- ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, up, usize));
- ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, vp, vsize));
+ scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
+ if (scratch > talloc)
+ talloc = scratch;
+ }
TMP_MARK;
+ tp = TMP_ALLOC_LIMBS(talloc);
- /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
- Two EXTRA limbs for U and V are required for kary reduction. */
- if (vsize >= GCD_ACCEL_THRESHOLD)
+ if (usize > n)
{
- unsigned long int vbitsize, d;
- mp_ptr orig_up = up;
- mp_size_t orig_usize = usize;
- mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
-
- MPN_COPY (anchor_up, orig_up, usize);
- up = anchor_up;
-
- count_leading_zeros (d, up[usize - 1]);
- d -= GMP_NAIL_BITS;
- d = usize * GMP_NUMB_BITS - d;
- count_leading_zeros (vbitsize, vp[vsize - 1]);
- vbitsize -= GMP_NAIL_BITS;
- vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
- ASSERT (d >= vbitsize);
- d = d - vbitsize + 1;
-
- /* Use bmod reduction to quickly discover whether V divides U. */
- up[usize++] = 0; /* Insert leading zero. */
- mpn_bdivmod (up, up, usize, vp, vsize, d);
-
- /* Now skip U/V mod 2^d and any low zero limbs. */
- d /= GMP_NUMB_BITS, up += d, usize -= d;
- while (usize != 0 && up[0] == 0)
- up++, usize--;
-
- if (usize == 0) /* GCD == ORIG_V. */
- goto done;
-
- vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
- MPN_COPY (vp, orig_vp, vsize);
-
- do /* Main loop. */
- {
- /* mpn_com_n can't be used here because anchor_up and up may
- partially overlap */
- if ((up[usize - 1] & GMP_NUMB_HIGHBIT) != 0) /* U < 0; take twos' compl. */
- {
- mp_size_t i;
- anchor_up[0] = -up[0] & GMP_NUMB_MASK;
- for (i = 1; i < usize; i++)
- anchor_up[i] = (~up[i] & GMP_NUMB_MASK);
- up = anchor_up;
- }
-
- MPN_NORMALIZE_NOT_ZERO (up, usize);
-
- if ((up[0] & 1) == 0) /* Result even; remove twos. */
- {
- unsigned int r;
- count_trailing_zeros (r, up[0]);
- mpn_rshift (anchor_up, up, usize, r);
- usize -= (anchor_up[usize - 1] == 0);
- }
- else if (anchor_up != up)
- MPN_COPY_INCR (anchor_up, up, usize);
-
- MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
- up = anchor_up;
-
- if (vsize <= 2) /* Kary can't handle < 2 limbs and */
- break; /* isn't efficient for == 2 limbs. */
+ mpn_tdiv_qr (tp, up, 0, up, usize, vp, n);
- d = vbitsize;
- count_leading_zeros (vbitsize, vp[vsize - 1]);
- vbitsize -= GMP_NAIL_BITS;
- vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
- d = d - vbitsize + 1;
-
- if (d > BMOD_THRESHOLD) /* Bmod reduction. */
- {
- up[usize++] = 0;
- mpn_bdivmod (up, up, usize, vp, vsize, d);
- d /= GMP_NUMB_BITS, up += d, usize -= d;
- }
- else /* Kary reduction. */
- {
- mp_limb_t bp[2], cp[2];
-
- /* C <-- V/U mod 2^(2*GMP_NUMB_BITS). */
- {
- mp_limb_t u_inv, hi, lo;
- modlimb_invert (u_inv, up[0]);
- cp[0] = (vp[0] * u_inv) & GMP_NUMB_MASK;
- umul_ppmm (hi, lo, cp[0], up[0] << GMP_NAIL_BITS);
- lo >>= GMP_NAIL_BITS;
- cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv & GMP_NUMB_MASK;
- }
-
- /* U <-- find_a (C) * U. */
- up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
- usize++;
-
- /* B <-- A/C == U/V mod 2^(GMP_NUMB_BITS + 1).
- bp[0] <-- U/V mod 2^GMP_NUMB_BITS and
- bp[1] <-- ( (U - bp[0] * V)/2^GMP_NUMB_BITS ) / V mod 2
-
- Like V/U above, but simplified because only the low bit of
- bp[1] is wanted. */
- {
- mp_limb_t v_inv, hi, lo;
- modlimb_invert (v_inv, vp[0]);
- bp[0] = (up[0] * v_inv) & GMP_NUMB_MASK;
- umul_ppmm (hi, lo, bp[0], vp[0] << GMP_NAIL_BITS);
- lo >>= GMP_NAIL_BITS;
- bp[1] = (up[1] + hi + (bp[0] & vp[1])) & 1;
- }
-
- up[usize++] = 0;
- if (bp[1] != 0) /* B < 0: U <-- U + (-B) * V. */
- {
- mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0] & GMP_NUMB_MASK);
- mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
- }
- else /* B >= 0: U <-- U - B * V. */
- {
- mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
- mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
- }
-
- up += 2, usize -= 2; /* At least two low limbs are zero. */
- }
-
- /* Must remove low zero limbs before complementing. */
- while (usize != 0 && up[0] == 0)
- up++, usize--;
+ if (mpn_zero_p (up, n))
+ {
+ MPN_COPY (gp, vp, n);
+ TMP_FREE;
+ return n;
}
- while (usize != 0);
-
- /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */
- up = orig_up, usize = orig_usize;
- binary_gcd_ctr = 2;
}
- else
- binary_gcd_ctr = 1;
- /* Finish up with the binary algorithm. Executes once or twice. */
- for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
+#if TUNE_GCD_P
+ while (CHOOSE_P (n) > 0)
+#else
+ while (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD))
+#endif
{
- if (usize > 2) /* First make U close to V in size. */
+ struct hgcd_matrix M;
+ mp_size_t p = CHOOSE_P (n);
+ mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
+ mp_size_t nn;
+ mpn_hgcd_matrix_init (&M, n - p, tp);
+ nn = mpn_hgcd (up + p, vp + p, n - p, &M, tp + matrix_scratch);
+ if (nn > 0)
{
- unsigned long int vbitsize, d;
- count_leading_zeros (d, up[usize - 1]);
- d -= GMP_NAIL_BITS;
- d = usize * GMP_NUMB_BITS - d;
- count_leading_zeros (vbitsize, vp[vsize - 1]);
- vbitsize -= GMP_NAIL_BITS;
- vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
- d = d - vbitsize - 1;
- if (d != -(unsigned long int)1 && d > 2)
- {
- mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */
- d /= (unsigned long int)GMP_NUMB_BITS, up += d, usize -= d;
- }
+ ASSERT (M.n <= (n - p - 1)/2);
+ ASSERT (M.n + p <= (p + n - 1) / 2);
+ /* Temporary storage 2 (p + M->n) <= p + n - 1. */
+ n = mpn_hgcd_matrix_adjust (&M, p + nn, up, vp, p, tp + matrix_scratch);
}
-
- /* Start binary GCD. */
- do
+ else
{
- mp_size_t zeros;
-
- /* Make sure U is odd. */
- MPN_NORMALIZE (up, usize);
- while (up[0] == 0)
- up += 1, usize -= 1;
- if ((up[0] & 1) == 0)
- {
- unsigned int r;
- count_trailing_zeros (r, up[0]);
- mpn_rshift (up, up, usize, r);
- usize -= (up[usize - 1] == 0);
- }
-
- /* Keep usize >= vsize. */
- if (usize < vsize)
- MPN_PTR_SWAP (up, usize, vp, vsize);
-
- if (usize <= 2) /* Double precision. */
- {
- if (vsize == 1)
- vp[0] = mpn_gcd_1 (up, usize, vp[0]);
- else
- vsize = gcd_2 (vp, up);
- break; /* Binary GCD done. */
- }
-
- /* Count number of low zero limbs of U - V. */
- for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
- continue;
-
- /* If U < V, swap U and V; in any case, subtract V from U. */
- if (zeros == vsize) /* Subtract done. */
- up += zeros, usize -= zeros;
- else if (usize == vsize)
+ /* Temporary storage n */
+ n = mpn_gcd_subdiv_step (gp, &gn, up, vp, n, tp);
+ if (n == 0)
{
- mp_size_t size = vsize;
- do
- size--;
- while (up[size] == vp[size]);
- if (up[size] < vp[size]) /* usize == vsize. */
- MP_PTR_SWAP (up, vp);
- up += zeros, usize = size + 1 - zeros;
- mpn_sub_n (up, up, vp + zeros, usize);
- }
- else
- {
- mp_size_t size = vsize - zeros;
- up += zeros, usize -= zeros;
- if (mpn_sub_n (up, up, vp + zeros, size))
- {
- while (up[size] == 0) /* Propagate borrow. */
- up[size++] = -(mp_limb_t)1;
- up[size] -= 1;
- }
+ TMP_FREE;
+ return gn;
}
}
- while (usize); /* End binary GCD. */
}
-done:
- if (vp != gp)
- MPN_COPY_INCR (gp, vp, vsize);
+ gn = mpn_gcd_lehmer_n (gp, up, vp, n, tp);
TMP_FREE;
- return vsize;
+ return gn;
}
-#define EVEN_P(x) (((x) & 1) == 0)
-
-/* Allows an even v */
-static mp_size_t
-gcd_binary (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
+#ifdef TUNE_GCD_P
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#define TIME(res, code) do { \
+ clock_t time_start; \
+ clock_t time_end; \
+ clock_t time_end_time; \
+ unsigned time_iter = 0; \
+ \
+ time_start = clock(); \
+ time_end_time = time_start + CLOCKS_PER_SEC / 100; \
+ do \
+ { \
+ code; \
+ time_end = clock(); \
+ time_iter++; \
+ } \
+ while (time_end <= time_end_time); \
+ \
+ (res) = (double) (time_end - time_start) / (CLOCKS_PER_SEC * time_iter); \
+ } while (0)
+
+int
main(int argc, char **argv)
{
- mp_size_t zero_words = 0;
- mp_size_t gsize;
- unsigned shift = 0;
-
- ASSERT (usize > 0);
- ASSERT (vsize > 0);
+ gmp_randstate_t rands;
+ mp_size_t n;
+ mp_ptr ap;
+ mp_ptr bp;
+ mp_ptr up;
+ mp_ptr vp;
+ mp_ptr gp;
+ mp_ptr tp;
+ TMP_DECL;
- if (up[0] == 0 && vp[0] == 0)
- {
- do
- gp[zero_words++] = 0;
- while (up[zero_words] == 0 && vp[zero_words] == 0);
+ /* Unbuffered so if output is redirected to a file it isn't lost if the
+ program is killed part way through. */
+ setbuf (stdout, NULL);
+ setbuf (stderr, NULL);
- up += zero_words; usize -= zero_words;
- vp += zero_words; vsize -= zero_words;
- gp += zero_words;
- }
+ gmp_randinit_default (rands);
- /* Now u and v can have a common power of two < 2^GMP_NUMB_BITS */
- if (up[0] == 0)
- {
- ASSERT (vp[0] != 0);
- if (EVEN_P (vp[0]))
- {
- count_trailing_zeros (shift, vp[0]);
- ASSERT (shift > 0);
- ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, shift));
- if (vp[vsize - 1] == 0)
- vsize--;
- }
- }
- else if (vp[0] == 0)
- {
- if (EVEN_P (up[0]))
- {
- count_trailing_zeros (shift, up[0]);
- ASSERT (shift > 0);
- }
- while (vp[0] == 0)
- {
- vp++;
- vsize--;
- }
-
- if (EVEN_P (vp[0]))
- {
- unsigned vcount;
-
- count_trailing_zeros (vcount, vp[0]);
- ASSERT (vcount > 0);
- ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount));
- if (vp[vsize - 1] == 0)
- vsize--;
- }
- }
- else if (EVEN_P (vp[0]))
- {
- unsigned vcount;
- count_trailing_zeros (vcount, vp[0]);
- ASSERT (vcount > 0);
- ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount));
- if (vp[vsize - 1] == 0)
- vsize--;
-
- if (EVEN_P (up[0]))
- {
- unsigned ucount;
- count_trailing_zeros (ucount, up[0]);
- ASSERT (ucount > 0);
- shift = MIN (ucount, vcount);
- }
- }
+ TMP_MARK;
- gsize = gcd_binary_odd (gp, up, usize, vp, vsize);
- if (shift)
+ ap = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ bp = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ up = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ vp = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ gp = TMP_ALLOC_LIMBS (P_TABLE_SIZE);
+ tp = TMP_ALLOC_LIMBS (MPN_GCD_LEHMER_N_ITCH (P_TABLE_SIZE));
+
+ mpn_random (ap, P_TABLE_SIZE);
+ mpn_random (bp, P_TABLE_SIZE);
+
+ memset (p_table, 0, sizeof(p_table));
+
+ for (n = 10; n < P_TABLE_SIZE; n++)
{
- mp_limb_t cy = mpn_lshift (gp, gp, gsize, shift);
- if (cy)
- gp[gsize++] = cy;
- }
- return gsize + zero_words;
-}
-
-#define MPN_LEQ_P(ap, asize, bp, bsize) \
-((asize) < (bsize) || ((asize) == (bsize) \
- && mpn_cmp ((ap), (bp), (asize)) <= 0))
-
-/* Sets (a, b, c, d) <-- (c, d, a, b) */
-#define NHGCD_SWAP4_2(row) \
-do { \
- struct hgcd_row __nhgcd_swap4_2_tmp; \
- __nhgcd_swap4_2_tmp = row[0]; \
- row[0] = row[2]; \
- row[2] = __nhgcd_swap4_2_tmp; \
- __nhgcd_swap4_2_tmp = row[1]; \
- row[1] = row[3]; \
- row[3] = __nhgcd_swap4_2_tmp; \
-} while (0)
-
-/* Sets (a, b, c) <-- (b, c, a) */
-#define NHGCD_SWAP3_LEFT(row) \
-do { \
- struct hgcd_row __nhgcd_swap4_left_tmp; \
- __nhgcd_swap4_left_tmp = row[0]; \
- row[0] = row[1]; \
- row[1] = row[2]; \
- row[2] = __nhgcd_swap4_left_tmp; \
-} while (0)
-
-static mp_size_t
-hgcd_tdiv (mp_ptr qp,
- mp_ptr rp, mp_size_t *rsizep,
- mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize)
-{
- mp_size_t qsize;
- mp_size_t rsize;
+ mp_size_t p;
+ mp_size_t best_p;
+ double best_time;
+ double lehmer_time;
- mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize);
+ if (ap[n-1] == 0)
+ ap[n-1] = 1;
- rsize = bsize;
- MPN_NORMALIZE (rp, rsize);
- *rsizep = rsize;
+ if (bp[n-1] == 0)
+ bp[n-1] = 1;
- qsize = asize - bsize + 1;
- qsize -= (qp[qsize - 1] == 0);
+ p_table[n] = 0;
+ TIME(lehmer_time, {
+ MPN_COPY (up, ap, n);
+ MPN_COPY (vp, bp, n);
+ mpn_gcd_lehmer_n (gp, up, vp, n, tp);
+ });
- if (qsize == 1 && qp[0] == 1)
- return 0;
-
- return qsize;
-}
-
-
-#if 0
-#define GCD_LEHMER_ITCH(asize) (5*((asize) + 1))
-
-static mp_size_t
-gcd_lehmer (mp_ptr gp, mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize,
- mp_ptr tp, mp_size_t talloc)
-{
- struct hgcd_row r[4];
- mp_ptr qp;
- mp_size_t qsize;
- mp_size_t ralloc = asize + 1;
+ best_time = lehmer_time;
+ best_p = 0;
- ASSERT (asize >= bsize);
- ASSERT (bsize > 0);
-
-#if 0
- if (BELOW_THRESHOLD (asize, MPN_GCD_LEHMER_THRESHOLD))
- {
- ASSERT (asize + bsize + 2 <= talloc);
-
- MPN_COPY (tp, ap, asize);
- MPN_COPY (tp + asize + 1, bp, bsize);
- return nhgcd_gcd_binary (gp, tp, asize, tp + asize + 1, bsize);
- }
-#endif
-
- ASSERT (MPN_LEQ_P (bp, bsize, ap, asize));
- ASSERT (5 * asize + 4 <= talloc);
-
- r[0].rp = tp; tp += ralloc; talloc -= ralloc;
- r[1].rp = tp; tp += ralloc; talloc -= ralloc;
- r[2].rp = tp; tp += ralloc; talloc -= ralloc;
- r[3].rp = tp; tp += ralloc; talloc -= ralloc;
- qp = tp; tp += asize; talloc -= asize;
-
- MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize;
- MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize;
-
-#if 0
- /* u and v fields aren't used, but zero them out so that we can call
- trace_nhgcd_row */
- r[0].uvp[0] = r[0].uvp[1] = NULL;
- r[1].uvp[0] = r[1].uvp[1] = NULL;
- r[2].uvp[0] = r[2].uvp[1] = NULL;
- r[3].uvp[0] = r[3].uvp[1] = NULL;
-#endif
-
- while (ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD) && r[1].rsize > 0)
- {
- struct hgcd2 hgcd;
- int res = mpn_hgcd2_lehmer_step (&hgcd,
- r[0].rp, r[0].rsize,
- r[1].rp, r[1].rsize,
- NULL);
-
- if (!res || (res == 2 && hgcd.row[0].v == 0))
+ for (p = 1; p < n; p += (n+9)/10)
{
- qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize,
- r[0].rp, r[0].rsize,
- r[1].rp, r[1].rsize);
- NHGCD_SWAP3_LEFT (r);
- }
- else
- {
- const struct hgcd2_row *s = hgcd.row + (res - 2);
- int sign = hgcd.sign;
- if (res == 3)
- sign = ~sign;
-
- /* s[0] and s[1] correct. */
- r[2].rsize
- = mpn_hgcd2_fix (r[2].rp, ralloc,
- sign,
- s[0].u, r[0].rp, r[0].rsize,
- s[0].v, r[1].rp, r[1].rsize);
-
- r[3].rsize
- = mpn_hgcd2_fix (r[3].rp, ralloc,
- ~sign,
- s[1].u, r[0].rp, r[0].rsize,
- s[1].v, r[1].rp, r[1].rsize);
-
- NHGCD_SWAP4_2 (r);
- }
- }
-
- if (r[1].rsize == 0)
- {
- MPN_COPY (gp, r[0].rp, r[0].rsize);
- return r[0].rsize;
- }
+ double t;
- return gcd_binary (gp, r[0].rp, r[0].rsize, r[1].rp, r[1].rsize);
-}
-#endif
-
-static mp_size_t
-gcd_schoenhage_itch (mp_size_t asize)
-{
- /* Size for hgcd calls */
- mp_size_t ralloc = asize + 1;
- mp_size_t hgcd_size = (asize + 1) / 2;
- return (4 * ralloc /* Remainder storage */
- + mpn_hgcd_init_itch (hgcd_size) /* hgcd storage */
- + qstack_itch (hgcd_size)
- + mpn_hgcd_itch (hgcd_size) /* nhgcd call */
- + 1+ 3 * asize / 4); /* hgcd_fix */
-}
+ p_table[n] = p;
+ TIME(t, {
+ MPN_COPY (up, ap, n);
+ MPN_COPY (vp, bp, n);
+ mpn_gcd (gp, up, n, vp, n);
+ });
-static mp_size_t
-gcd_schoenhage (mp_ptr gp, mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize,
- mp_ptr tp, mp_size_t talloc)
-{
- mp_size_t scratch;
- struct hgcd hgcd;
- struct qstack quotients;
- struct hgcd_row r[4];
-
- mp_size_t ralloc = asize + 1;
-
- ASSERT (asize >= bsize);
- ASSERT (bsize > 0);
-
- ASSERT (MPN_LEQ_P (bp, bsize, ap, asize));
-
- ASSERT (4 * ralloc <= talloc);
- tp += ralloc; talloc -= ralloc;
- r[0].rp = tp; tp += ralloc; talloc -= ralloc;
- r[1].rp = tp; tp += ralloc; talloc -= ralloc;
- r[2].rp = tp; tp += ralloc; talloc -= ralloc;
- r[3].rp = tp; tp += ralloc; talloc -= ralloc;
-
- MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize;
- MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize;
-
-#if 0
- /* We don't use the u and v fields, but zero them out so that we can
- call trace_nhgcd_row while debugging. */
- r[0].uvp[0] = r[0].uvp[1] = NULL;
- r[1].uvp[0] = r[1].uvp[1] = NULL;
- r[2].uvp[0] = r[2].uvp[1] = NULL;
- r[3].uvp[0] = r[3].uvp[1] = NULL;
-#endif
-
- scratch = mpn_hgcd_init_itch ((asize + 1)/2);
- ASSERT (scratch <= talloc);
- mpn_hgcd_init (&hgcd, (asize + 1)/2, tp);
- tp += scratch; talloc -= scratch;
-
- {
- mp_size_t nlimbs = qstack_itch ((asize + 1)/2);
-
- ASSERT (nlimbs <= talloc);
-
- qstack_init (&quotients, (asize + 1) / 2, tp, nlimbs);
-
- tp += nlimbs;
- talloc -= nlimbs;
- }
-
- while (ABOVE_THRESHOLD (r[0].rsize, GCD_SCHOENHAGE_THRESHOLD)
- && r[1].rsize > 0)
- {
- mp_size_t k = r[0].rsize / 2;
- int res;
-
-#if 0
- trace ("nhgcd_gcd_schoenhage\n");
- trace_nhgcd_row (r);
- trace_nhgcd_row (r + 1);
-#endif
- if (r[1].rsize <= k)
- goto euclid;
-
- qstack_reset (&quotients, r[0].rsize - k);
-
- res = mpn_hgcd (&hgcd,
- r[0].rp + k, r[0].rsize - k,
- r[1].rp + k, r[1].rsize - k,
- &quotients,
- tp, talloc);
-
- if (res == 0 || res == 1)
- {
- euclid:
- ASSERT (r[0].rsize - r[1].rsize + 1 <= talloc);
- hgcd_tdiv (tp, r[2].rp, &r[2].rsize,
- r[0].rp, r[0].rsize,
- r[1].rp, r[1].rsize);
-
- NHGCD_SWAP3_LEFT (r);
- }
- else
- {
- const struct hgcd_row *s = hgcd.row + (res - 2);
- int sign = hgcd.sign;
- if (res == 3)
- sign = ~sign;
-
- /* s[0] and s[1] are correct */
- r[2].rsize
- = mpn_hgcd_fix (k, r[2].rp, ralloc,
- sign, hgcd.size, s,
- r[0].rp, r[1].rp,
- tp, talloc);
-
- r[3].rsize
- = mpn_hgcd_fix (k, r[3].rp, ralloc,
- ~sign, hgcd.size, s+1,
- r[0].rp, r[1].rp,
- tp, talloc);
-
- NHGCD_SWAP4_2 (r);
+ if (t < best_time)
+ {
+ best_time = t;
+ best_p = p;
+ }
}
- }
+ printf("%6d %6d %5.3g", n, best_p, (double) best_p / n);
+ if (best_p > 0)
+ printf(" %5.3g%%", 100 * (lehmer_time - best_time) / lehmer_time);
+ printf("\n");
-#if 0
- trace ("nhgcd_gcd_schoenhage after loop\n");
- trace_nhgcd_row (r);
- trace_nhgcd_row (r + 1);
-#endif
-
- if (r[1].rsize == 0)
- {
- MPN_COPY (gp, r[0].rp, r[0].rsize);
- return r[0].rsize;
- }
-#if 0
- else if (ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD))
- return gcd_lehmer (gp,
- r[0].rp, r[0].rsize,
- r[1].rp, r[1].rsize,
- tp, talloc);
-#endif
- else
- return gcd_binary (gp,
- r[0].rp, r[0].rsize,
- r[1].rp, r[1].rsize);
-}
-
-/* Should we perform an initial division? */
-mp_size_t
-mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
-{
- if (BELOW_THRESHOLD (usize, GCD_SCHOENHAGE_THRESHOLD))
- return gcd_binary_odd (gp, up, usize, vp, vsize);
-
- /* The algorithms below require U >= V, while mpn_gcd is long documented as
- requiring only that the position of U's msb >= V's msb. */
- if (usize == vsize && mpn_cmp (up, vp, usize) < 0)
- MP_PTR_SWAP (up, vp);
-
-#if 0
- if (BELOW_THRESHOLD (usize, GCD_SCHOENHAGE_THRESHOLD))
- {
- mp_size_t scratch;
- mp_ptr tp;
- mp_size_t gsize;
- TMP_DECL;
-
- TMP_MARK;
-
- scratch = GCD_LEHMER_ITCH (usize);
- tp = TMP_ALLOC_LIMBS (scratch);
-
- gsize = gcd_lehmer (gp, up, usize, vp, vsize, tp, scratch);
- TMP_FREE;
- return gsize;
- }
- else
-#endif
- {
- mp_size_t scratch;
- mp_ptr tp;
- mp_size_t gsize;
-
- scratch = gcd_schoenhage_itch (usize);
- tp = __GMP_ALLOCATE_FUNC_LIMBS (scratch);
-
- gsize = gcd_schoenhage (gp, up, usize, vp, vsize, tp, scratch);
- __GMP_FREE_FUNC_LIMBS (tp, scratch);
- return gsize;
+ p_table[n] = best_p;
}
+ TMP_FREE;
+ gmp_randclear(rands);
+ return 0;
}
+#endif /* TUNE_GCD_P */
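This TUNE_GCD_P harness is what the tune-gcd-p rule added to mpn/Makefile.am earlier in this diff builds: gcd.c compiled with -DTUNE_GCD_P=1 and linked with -L ../.libs -lgmp. Presumably one runs the resulting binary from the mpn/ build directory; for each size n it prints the best split p found, the ratio p/n, and the percentage speedup over plain mpn_gcd_lehmer_n.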
diff --git a/mpn/generic/gcd_lehmer.c b/mpn/generic/gcd_lehmer.c
new file mode 100644
index 000000000..42a7ddefc
--- /dev/null
+++ b/mpn/generic/gcd_lehmer.c
@@ -0,0 +1,161 @@
+/* gcd_lehmer.c.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Use binary algorithm to compute G <-- GCD (U, V) for usize, vsize == 2.
+ Both U and V must be odd. */
+static inline mp_size_t
+gcd_2 (mp_ptr gp, mp_srcptr up, mp_srcptr vp)
+{
+ mp_limb_t u0, u1, v0, v1;
+ mp_size_t gn;
+
+ u0 = up[0];
+ u1 = up[1];
+ v0 = vp[0];
+ v1 = vp[1];
+
+ ASSERT (u0 & 1);
+ ASSERT (v0 & 1);
+
+ /* Check for u0 != v0 needed to ensure that argument to
+ * count_trailing_zeros is non-zero. */
+ while (u1 != v1 && u0 != v0)
+ {
+ unsigned long int r;
+ if (u1 > v1)
+ {
+ u1 -= v1 + (u0 < v0);
+ u0 = (u0 - v0) & GMP_NUMB_MASK;
+ count_trailing_zeros (r, u0);
+ u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r);
+ u1 >>= r;
+ }
+ else /* u1 < v1. */
+ {
+ v1 -= u1 + (v0 < u0);
+ v0 = (v0 - u0) & GMP_NUMB_MASK;
+ count_trailing_zeros (r, v0);
+ v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r);
+ v1 >>= r;
+ }
+ }
+
+ gp[0] = u0, gp[1] = u1, gn = 1 + (u1 != 0);
+
+ /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
+ if (u1 == v1 && u0 == v0)
+ return gn;
+
+ v0 = (u0 == v0) ? ((u1 > v1) ? u1-v1 : v1-u1) : ((u0 > v0) ? u0-v0 : v0-u0);
+ gp[0] = mpn_gcd_1 (gp, gn, v0);
+
+ return 1;
+}
+
+/* Temporary storage: n */
+mp_size_t
+mpn_gcd_lehmer_n (mp_ptr gp, mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
+{
+ mp_size_t scratch;
+
+ /* Relax this requirement, and normalize at the start? Must disallow
+ A = B = 0, though. */
+ ASSERT(ap[n-1] > 0 || bp[n-1] > 0);
+
+ while (n > 2)
+ {
+ struct hgcd_matrix1 M;
+ mp_limb_t ah, al, bh, bl;
+ mp_limb_t mask;
+
+ mask = ap[n-1] | bp[n-1];
+ ASSERT (mask > 0);
+
+ if (mask & GMP_NUMB_HIGHBIT)
+ {
+ ah = ap[n-1]; al = ap[n-2];
+ bh = bp[n-1]; bl = bp[n-2];
+ }
+ else
+ {
+ int shift;
+
+ count_leading_zeros (shift, mask);
+ ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+ al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+ bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+ bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+ }
+
+ /* Try an mpn_hgcd2 step */
+ if (mpn_hgcd2 (ah, al, bh, bl, &M))
+ /* Temporary storage n */
+ n = mpn_hgcd_mul_matrix1_inverse_vector (&M, n, ap, bp, tp);
+
+ else
+ {
+ /* mpn_hgcd2 has failed. Then either one of a or b is very
+ small, or the difference is very small. Perform one
+ subtraction followed by one division. */
+ mp_size_t gn;
+
+ /* Temporary storage n */
+ n = mpn_gcd_subdiv_step (gp, &gn, ap, bp, n, tp);
+ if (n == 0)
+ return gn;
+ }
+ }
+
+ if (n == 1)
+ {
+ *gp = mpn_gcd_1(ap, 1, bp[0]);
+ return 1;
+ }
+
+ /* Due to the calling convention for mpn_gcd, at most one can be
+ even. */
+
+ if (! (ap[0] & 1))
+ MP_PTR_SWAP (ap, bp);
+
+ ASSERT (ap[0] & 1);
+
+ if (bp[0] == 0)
+ {
+ *gp = mpn_gcd_1 (ap, 2, bp[1]);
+ return 1;
+ }
+ else if (! (bp[0] & 1))
+ {
+ int r;
+ count_trailing_zeros (r, bp[0]);
+ bp[0] = ((bp[1] << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (bp[0] >> r);
+ bp[1] >>= r;
+ }
+
+ return gcd_2(gp, ap, bp);
+}
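
The two-limb loop in gcd_2 above follows the same pattern as the classic
word-level binary GCD. For reference, a minimal self-contained sketch,
assuming a compiler with the GCC builtin __builtin_ctzll in place of
count_trailing_zeros; the function name is made up for illustration and is
not part of GMP.

#include <stdint.h>

static uint64_t
binary_gcd_1 (uint64_t u, uint64_t v)
{
  int shift;

  if (u == 0) return v;
  if (v == 0) return u;

  shift = __builtin_ctzll (u | v);   /* common factor 2^shift */
  u >>= __builtin_ctzll (u);         /* make u odd */
  while (v != 0)
    {
      v >>= __builtin_ctzll (v);     /* make v odd */
      if (u > v)
        {
          uint64_t t = u; u = v; v = t;
        }
      v -= u;                        /* difference is even (or zero) */
    }
  return u << shift;
}
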
diff --git a/mpn/generic/gcd_subdiv_step.c b/mpn/generic/gcd_subdiv_step.c
new file mode 100644
index 000000000..d9708e8e1
--- /dev/null
+++ b/mpn/generic/gcd_subdiv_step.c
@@ -0,0 +1,116 @@
+/* gcd_subdiv_step.c.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static inline int
+mpn_zero_p (mp_srcptr ap, mp_size_t n)
+{
+ mp_size_t i;
+ for (i = n - 1; i >= 0; i--)
+ {
+ if (ap[i] != 0)
+ return 0;
+ }
+ return 1;
+}
+
+/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or
+ b is small, or the difference is small. Performs one subtraction
+ followed by one division. If the gcd is found, stores it in gp and
+ *gn, and returns zero. Otherwise, computes the reduced a and b, and
+ returns the new size. */
+
+/* FIXME: Check when the smaller number is a single limb, and invoke
+ * mpn_gcd_1. */
+mp_size_t
+mpn_gcd_subdiv_step (mp_ptr gp, mp_size_t *gn,
+ mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
+{
+ mp_size_t an, bn;
+
+ ASSERT (n > 0);
+ ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
+
+ an = bn = n;
+ MPN_NORMALIZE (ap, an);
+ MPN_NORMALIZE (bp, bn);
+
+ if (UNLIKELY (an == 0))
+ {
+ return_b:
+ MPN_COPY (gp, bp, bn);
+ *gn = bn;
+ return 0;
+ }
+ else if (UNLIKELY (bn == 0))
+ {
+ return_a:
+ MPN_COPY (gp, ap, an);
+ *gn = an;
+ return 0;
+ }
+
+ /* Arrange so that a > b, subtract a -= b, and maintain
+ normalization. */
+ if (an < bn)
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ else if (an == bn)
+ {
+ int c;
+ MPN_CMP (c, ap, bp, an);
+ if (UNLIKELY (c == 0))
+ goto return_a;
+ else if (c < 0)
+ MP_PTR_SWAP (ap, bp);
+ }
+
+ ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn));
+ MPN_NORMALIZE (ap, an);
+ ASSERT (an > 0);
+
+ /* Arrange so that a > b, and divide a = q b + r */
+ /* FIXME: an < bn happens when we have cancellation. If that is the
+ common case, then we could reverse the roles of a and b to avoid
+ the swap. */
+ if (an < bn)
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ else if (an == bn)
+ {
+ int c;
+ MPN_CMP (c, ap, bp, an);
+ if (UNLIKELY (c == 0))
+ goto return_a;
+ else if (c < 0)
+ MP_PTR_SWAP (ap, bp);
+ }
+
+ mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
+
+ if (mpn_zero_p (ap, bn))
+ goto return_b;
+
+ return bn;
+}
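
The subtract-then-divide fallback above is easier to see at the mpz level:
one subtraction and one division, neither of which changes gcd(a, b). A
minimal sketch assuming a >= b > 0; the helper name is illustrative, not a
GMP interface, and only documented mpz calls are used.

#include <gmp.h>

static void
subdiv_step_sketch (mpz_t a, mpz_t b)
{
  mpz_sub (a, a, b);          /* a -= b; gcd(a, b) unchanged */
  if (mpz_cmp (a, b) >= 0)
    mpz_mod (a, a, b);        /* a = a mod b; gcd(a, b) unchanged */
}
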
diff --git a/mpn/generic/gcdext.c b/mpn/generic/gcdext.c
index 63528f98e..94d490791 100644
--- a/mpn/generic/gcdext.c
+++ b/mpn/generic/gcdext.c
@@ -18,819 +18,101 @@ License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define WANT_TRACE 0
-
-/* Default to binary gcdext_1, since it is best on most current machines.
- We should teach tuneup to choose the right gcdext_1. */
-#define GCDEXT_1_USE_BINARY 1
-
-#if WANT_TRACE
-# include <stdio.h>
-# include <stdarg.h>
-#endif
-
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-#ifndef NULL
-# define NULL ((void *) 0)
-#endif
-
-#if WANT_TRACE
-static void
-trace (const char *format, ...)
-{
- va_list args;
- va_start (args, format);
- gmp_vfprintf (stderr, format, args);
- va_end (args);
-}
-#endif
-
-/* Comparison of _normalized_ numbers. */
-
-#define MPN_EQUAL_P(ap, asize, bp, bsize) \
-((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0)
-
-#define MPN_LEQ_P(ap, asize, bp, bsize) \
-((asize) < (bsize) || ((asize) == (bsize) \
- && mpn_cmp ((ap), (bp), (asize)) <= 0))
-
-/* Returns g, u and v such that g = u A - v B. There are three
- different cases for the result:
-
- g = u A - v B, 0 < u < b, 0 < v < a
- g = A u = 1, v = 0
- g = B u = B, v = A - 1
-
- We always return with 0 < u <= b, 0 <= v < a.
-*/
-#if GCDEXT_1_USE_BINARY
-
-static mp_limb_t
-gcdext_1_odd (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b)
+static inline int
+mpn_zero_p (mp_srcptr ap, mp_size_t n)
{
- mp_limb_t u0;
- mp_limb_t v0;
- mp_limb_t v1;
- mp_limb_t u1;
-
- mp_limb_t B = b;
- mp_limb_t A = a;
-
- /* Through out this function maintain
-
- a = u0 A - v0 B
- b = u1 A - v1 B
-
- where A and B are odd. */
-
- u0 = 1; v0 = 0;
- u1 = b; v1 = a-1;
-
- if (A == 1)
- {
- *up = u0; *vp = v0;
- return 1;
- }
- else if (B == 1)
- {
- *up = u1; *vp = v1;
- return 1;
- }
-
- while (a != b)
- {
- mp_limb_t mask;
-
- ASSERT (a % 2 == 1);
- ASSERT (b % 2 == 1);
-
- ASSERT (0 < u0); ASSERT (u0 <= B);
- ASSERT (0 < u1); ASSERT (u1 <= B);
-
- ASSERT (0 <= v0); ASSERT (v0 < A);
- ASSERT (0 <= v1); ASSERT (v1 < A);
-
- if (a > b)
- {
- MP_LIMB_T_SWAP (a, b);
- MP_LIMB_T_SWAP (u0, u1);
- MP_LIMB_T_SWAP (v0, v1);
- }
-
- ASSERT (a < b);
-
- /* Makes b even */
- b -= a;
-
- mask = - (mp_limb_t) (u1 < u0);
- u1 += B & mask;
- v1 += A & mask;
- u1 -= u0;
- v1 -= v0;
-
- ASSERT (b % 2 == 0);
-
- do
- {
- /* As b = u1 A + v1 B is even, while A and B are odd,
- either both or none of u1, v1 is even */
-
- ASSERT (u1 % 2 == v1 % 2);
-
- mask = -(u1 & 1);
- u1 = u1 / 2 + ((B / 2) & mask) - mask;
- v1 = v1 / 2 + ((A / 2) & mask) - mask;
-
- b /= 2;
- }
- while (b % 2 == 0);
- }
-
- /* Now g = a = b */
- ASSERT (a == b);
- ASSERT (u1 <= B);
- ASSERT (v1 < A);
-
- ASSERT (A % a == 0);
- ASSERT (B % a == 0);
- ASSERT (u0 % (B/a) == u1 % (B/a));
- ASSERT (v0 % (A/a) == v1 % (A/a));
-
- *up = u0; *vp = v0;
-
- return a;
-}
-
-static mp_limb_t
-gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b)
-{
- unsigned shift = 0;
- mp_limb_t g;
- mp_limb_t u;
- mp_limb_t v;
-
- /* We use unsigned values in the range 0, ... B - 1. As the values
- are uniquely determined only modulo B, we can add B at will, to
- get numbers in range or flip the least significant bit. */
- /* Deal with powers of two */
- while ((a | b) % 2 == 0)
- {
- a /= 2; b /= 2; shift++;
- }
-
- if (b % 2 == 0)
- {
- unsigned k = 0;
-
- do {
- b /= 2; k++;
- } while (b % 2 == 0);
-
- g = gcdext_1_odd (&u, &v, a, b);
-
- while (k--)
- {
- /* We have g = u a + v b, and need to construct
- g = u'a + v'(2b).
-
- If v is even, we can just set u' = u, v' = v/2
- If v is odd, we can set v' = (v + a)/2, u' = u + b
- */
-
- if (v % 2 == 0)
- v /= 2;
- else
- {
- u = u + b;
- v = v/2 + a/2 + 1;
- }
- b *= 2;
- }
- }
- else if (a % 2 == 0)
- {
- unsigned k = 0;
-
- do {
- a /= 2; k++;
- } while (a % 2 == 0);
-
- g = gcdext_1_odd (&u, &v, a, b);
-
- while (k--)
- {
- /* We have g = u a + v b, and need to construct
- g = u'(2a) + v'b.
-
- If u is even, we can just set u' = u/2, v' = v.
- If u is odd, we can set u' = (u + b)/2
- */
-
- if (u % 2 == 0)
- u /= 2;
- else
- {
- u = u/2 + b/2 + 1;
- v = v + a;
- }
- a *= 2;
- }
- }
- else
- /* Ok, both are odd */
- g = gcdext_1_odd (&u, &v, a, b);
-
- *up = u;
- *vp = v;
-
- return g << shift;
-}
-
-#else /* ! GCDEXT_1_USE_BINARY */
-static mp_limb_t
-gcdext_1_u (mp_limb_t *up, mp_limb_t a, mp_limb_t b)
-{
- /* Maintain
-
- a = u0 A mod B
- b = - u1 A mod B
- */
- mp_limb_t u0 = 1;
- mp_limb_t u1 = 0;
- mp_limb_t B = b;
-
- ASSERT (a >= b);
- ASSERT (b > 0);
-
- for (;;)
+ mp_size_t i;
+ for (i = n - 1; i >= 0; i--)
{
- mp_limb_t q;
-
- q = a / b;
- a -= q * b;
-
- if (a == 0)
- {
- *up = B - u1;
- return b;
- }
- u0 += q * u1;
-
- q = b / a;
- b -= q * a;
-
- if (b == 0)
- {
- *up = u0;
- return a;
- }
- u1 += q * u0;
+ if (ap[i] != 0)
+ return 0;
}
+ return 1;
}
-static mp_limb_t
-gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b)
-{
- /* Maintain
-
- a = u0 A - v0 B
- b = - u1 A + v1 B = (B - u1) A - (A - v1) B
- */
- mp_limb_t u0 = 1;
- mp_limb_t v0 = 0;
- mp_limb_t u1 = 0;
- mp_limb_t v1 = 1;
-
- mp_limb_t A = a;
- mp_limb_t B = b;
-
- ASSERT (a >= b);
- ASSERT (b > 0);
-
- for (;;)
- {
- mp_limb_t q;
-
- q = a / b;
- a -= q * b;
-
- if (a == 0)
- {
- *up = B - u1;
- *vp = A - v1;
- return b;
- }
- u0 += q * u1;
- v0 += q * v1;
+/* Computes r = u0 x0 + u1 x1. Needs n = un + xn limbs of temporary
+ storage. Result is of size n-1, n or n+1, and the size is returned
+ (if inputs are non-normalized, result may be non-normalized too).
- q = b / a;
- b -= q * a;
+ No overlap between input and output is allowed, since rp is used
+ for temporary storage. */
- if (b == 0)
- {
- *up = u0;
- *vp = v0;
- return a;
- }
- u1 += q * u0;
- v1 += q * v0;
- }
-}
-#endif /* ! GCDEXT_1_USE_BINARY */
-
-/* FIXME: Duplicated in gcd.c */
static mp_size_t
-hgcd_tdiv (mp_ptr qp,
- mp_ptr rp, mp_size_t *rsizep,
- mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize)
+addmul2_n (mp_ptr rp,
+ mp_srcptr u0, mp_srcptr u1, mp_size_t un,
+ mp_srcptr x0, mp_srcptr x1, mp_size_t xn,
+ mp_ptr tp)
{
- mp_size_t qsize;
- mp_size_t rsize;
-
- mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize);
-
- rsize = bsize;
- MPN_NORMALIZE (rp, rsize);
- *rsizep = rsize;
-
- qsize = asize - bsize + 1;
- qsize -= (qp[qsize - 1] == 0);
-
- if (qsize == 1 && qp[0] == 1)
- return 0;
-
- return qsize;
-}
-
-/* FIXME: Duplicated in hgcd.c */
-static mp_limb_t
-mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n,
- mp_ptr ap, mp_limb_t u,
- mp_ptr bp, mp_limb_t v)
-{
- mp_limb_t h;
mp_limb_t cy;
+ mp_size_t n;
- h = mpn_mul_1 (rp, ap, n, u);
- cy = mpn_addmul_1 (rp, bp, n, v);
- h += cy;
-#if GMP_NAIL_BITS == 0
- rp[n] = h;
- return (h < cy);
-#else /* GMP_NAIL_BITS > 0 */
- rp[n] = h & GMP_NUMB_MASK;
- return h >> GMP_NUMB_BITS;
-#endif /* GMP_NAIL_BITS > 0 */
-}
-
-
-/* Computes u2 = u0 + q u1
-
- Returns new size.
-
- FIXME: Notation in the function not quite consistent
- FIXME: Severe code duplication with hgcd_update_uv */
-
-static mp_size_t
-hgcd_update_u (struct hgcd_row *r, mp_size_t usize,
- mp_srcptr qp, mp_size_t qsize,
- /* Limbs allocated for the new u, for sanity
- checking */
- mp_size_t alloc)
-{
- mp_srcptr u0p = r[0].uvp[0];
- mp_srcptr u1p = r[1].uvp[0];
- mp_ptr u2p = r[2].uvp[0];
-
- ASSERT (usize < alloc);
-
- /* u1 = 0 is an exceptional case. Except for this, u1 should be
- normalized. */
-
- ASSERT ((usize == 1 && u1p[0] == 0) || u1p[usize - 1] != 0);
-
- /* Compute u2 = u0 + q u1 */
-
- if (usize == 1 && u1p[0] == 0)
- {
- /* u1 == 0 is a special case, then q might be large, but it
- doesn't matter. Can happen only when u0 = v1 = 1, u1 = v0 =
- 0, and hence usize == 1. */
- MPN_COPY (u2p, u0p, usize);
- }
- else if (qsize == 0)
- /* Represents a unit quotient */
- {
- mp_limb_t cy = mpn_add_n (u2p, u0p, u1p, usize);
- u2p[usize] = cy;
- usize += (cy != 0);
- }
- else if (qsize == 1)
- {
- mp_limb_t cy;
-
- cy = mpn_mul_1 (u2p, u1p, usize, qp[0]);
- cy += mpn_add_n (u2p, u2p, u0p, usize);
-
- u2p[usize] = cy;
- usize += (cy != 0);
- }
- else
- {
- if (qsize <= usize)
- mpn_mul (u2p, u1p, usize, qp, qsize);
- else
- mpn_mul (u2p, qp, qsize, u1p, usize);
-
- ASSERT_NOCARRY (mpn_add (u2p,
- u2p, usize + qsize,
- u0p, usize));
-
- usize += qsize;
- usize -= (u2p[usize - 1] == 0);
- }
- ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0);
- ASSERT (r[2].uvp[0][usize - 1] != 0);
-
- return usize;
-}
-
-
-/* Computes Y = R * X. No overlap allowed. */
-static mp_size_t
-hgcd2_mul_vector (struct hgcd_row *Y,
- mp_size_t alloc,
- const struct hgcd2_row *R,
- const struct hgcd_row *X, mp_size_t n)
-{
- unsigned i;
- int grow = 0;
- mp_limb_t h = 0;
-
- ASSERT (n < alloc);
-
- for (i = 0; i < 2; i++)
- {
- /* Set Y[i] = R[i, 0] X[0] + R[i,1] X[1]
- = u X[0] + v X[0] */
- mp_limb_t cy;
-
- cy = mpn_addmul2_n_1 (Y[i].uvp[0], n,
- X[0].uvp[0], R[i].u,
- X[1].uvp[0], R[i].v);
-
- if (cy)
- {
- ASSERT (n + 2 <= alloc);
- Y[i].uvp[0][n+1] = cy;
- grow = 1;
- }
- else
- h |= Y[i].uvp[0][n];
- }
- if (grow)
- return n + 2;
- else
- /* Don't add redundant zeroes */
- return n + (h != 0);
-}
-
-/* Sets (a, b, c) <-- (b, c, a) */
-#define HGCD_SWAP3_LEFT(row) \
-do { \
- struct hgcd_row __hgcd_swap4_left_tmp = row[0]; \
- row[0] = row[1]; \
- row[1] = row[2]; \
- row[2] = __hgcd_swap4_left_tmp; \
-} while (0)
-
-/* Sets (a, b, c, d) <-- (c, d, a, b) */
-#define HGCD_SWAP4_2(row) \
-do { \
- struct hgcd_row __hgcd_swap4_2_tmp = row[0]; \
- row[0] = row[2]; \
- row[2] = __hgcd_swap4_2_tmp; \
- __hgcd_swap4_2_tmp = row[1]; \
- row[1] = row[3]; \
- row[3] = __hgcd_swap4_2_tmp; \
-} while (0)
-
-static mp_size_t
-gcdext_lehmer_itch (mp_size_t asize, mp_size_t bsize)
-{
- mp_size_t ralloc = asize + 1;
- mp_size_t ualloc = bsize + 1;
-
- return 4 * ralloc + 4 * ualloc + asize;
-}
-
-static mp_size_t
-gcdext_lehmer (mp_ptr gp, mp_ptr up, mp_size_t *usize,
- mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize,
- mp_ptr tp, mp_size_t talloc)
-{
- struct hgcd_row r[4];
- /* Size and sign of u fields. The largest u should be normalized to
- this size, and except for the case u1 = 0, that is the latest
- u. */
- int rsize;
- int rsign;
-
- mp_ptr qp;
- mp_size_t qsize;
- mp_size_t ralloc = asize + 1;
- mp_size_t ualloc = bsize + 1;
-
- struct hgcd2 hgcd;
- int res;
-
- ASSERT (asize >= bsize);
- ASSERT (asize > 1);
- ASSERT (bsize > 0);
-
- ASSERT (MPN_LEQ_P (bp, bsize, ap, asize));
-
- ASSERT (4 * ralloc + 4*ualloc + asize <= talloc);
-
- r[0].rp = tp; tp += ralloc; talloc -= ralloc;
- r[1].rp = tp; tp += ralloc; talloc -= ralloc;
- r[2].rp = tp; tp += ralloc; talloc -= ralloc;
- r[3].rp = tp; tp += ralloc; talloc -= ralloc;
-
- /* Must zero out the u fields. We don't use the v fields. */
- MPN_ZERO (tp, 4 * ualloc);
-
- r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc;
- r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc;
- r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc;
- r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc;
-
- qp = tp; tp += asize; talloc -= asize;
-
- res = mpn_hgcd2_lehmer_step (&hgcd,
- ap, asize,
- bp, bsize,
- NULL);
-
- if (res == 0 || (res == 2 && hgcd.row[0].v == 0))
+ if (xn >= un)
{
- qsize = hgcd_tdiv (qp, r[1].rp, &r[1].rsize,
- ap, asize,
- bp, bsize);
- MPN_COPY (r[0].rp, bp, bsize);
- r[0].rsize = bsize;
-
- r[0].uvp[0][0] = 0;
- r[1].uvp[0][0] = 1;
- rsign = -1;
+ mpn_mul (rp, x0, xn, u0, un);
+ mpn_mul (tp, x1, xn, u1, un);
}
else
{
- const struct hgcd2_row *s = hgcd.row + (res - 2);
- rsign = hgcd.sign;
- if (res == 3)
- rsign = ~rsign;
-
- /* s[0] and s[1] correct. */
- r[0].rsize
- = mpn_hgcd2_fix (r[0].rp, ralloc,
- rsign,
- s[0].u, ap, asize,
- s[0].v, bp, bsize);
-
- r[1].rsize
- = mpn_hgcd2_fix (r[1].rp, ralloc,
- ~rsign,
- s[1].u, ap, asize,
- s[1].v, bp, bsize);
-
- r[0].uvp[0][0] = s[0].u;
- r[1].uvp[0][0] = s[1].u;
- }
- rsize = 1;
-
- while (r[0].rsize >= 2 && r[1].rsize > 0)
- {
- res = mpn_hgcd2_lehmer_step (&hgcd,
- r[0].rp, r[0].rsize,
- r[1].rp, r[1].rsize,
- NULL);
-
- if (res == 0 || (res == 2 && hgcd.row[0].v == 0))
- {
- qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize,
- r[0].rp, r[0].rsize,
- r[1].rp, r[1].rsize);
- rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc);
- HGCD_SWAP3_LEFT (r);
- rsign = ~rsign;
- }
- else
- {
- const struct hgcd2_row *s = hgcd.row + (res - 2);
- int sign = hgcd.sign;
- if (res == 3)
- sign = ~sign;
-
- /* s[0] and s[1] correct. */
- r[2].rsize
- = mpn_hgcd2_fix (r[2].rp, ralloc,
- sign,
- s[0].u, r[0].rp, r[0].rsize,
- s[0].v, r[1].rp, r[1].rsize);
-
- r[3].rsize
- = mpn_hgcd2_fix (r[3].rp, ralloc,
- ~sign,
- s[1].u, r[0].rp, r[0].rsize,
- s[1].v, r[1].rp, r[1].rsize);
-
- rsize = hgcd2_mul_vector (r + 2, ralloc, s, r, rsize);
- rsign ^= sign;
- HGCD_SWAP4_2 (r);
- }
+ mpn_mul (rp, u0, un, x0, xn);
+ mpn_mul (tp, u1, un, x1, xn);
}
- if (r[1].rsize == 0)
- {
- MPN_NORMALIZE (r[0].uvp[0], rsize);
- MPN_COPY (gp, r[0].rp, r[0].rsize);
- MPN_COPY (up, r[0].uvp[0], rsize);
+ n = un + xn;
+ cy = mpn_add_n (rp, rp, tp, n);
- *usize = (rsign >= 0) ? rsize : -rsize;
- return r[0].rsize;
- }
+ if (cy > 0)
+ rp[n++] = cy;
else
- {
- mp_limb_t cy;
- mp_limb_t u;
- mp_limb_t v;
-
- gp[0] = gcdext_1 (&u, &v, r[0].rp[0], r[1].rp[0]);
- cy = mpn_addmul2_n_1 (up, rsize,
- r[0].uvp[0], u,
- r[1].uvp[0], v);
- rsize++;
- if (cy)
- up[rsize++] = cy;
- else
- MPN_NORMALIZE (up, rsize);
+ MPN_NORMALIZE (rp, n);
- *usize = (rsign >= 0) ? rsize : -rsize;
- return 1;
- }
+ return n;
}
-/* Computes Y = R * X. No overlap allowed.
-
- Temporary space is needed for two numbers smaller than the
- resulting matrix elements, i.e. bounded by 2*L <= N.
-
- FIXME: Severe code duplication with hgcd.c: hgcd_mul. */
+#define COMPUTE_V_ITCH(n) (2*(n) + 1)
+/* Computes |v| = |(g - u a)| / b, where u may be positive or
+ negative, and v is of the opposite sign. a, b are of size n, u and
+ v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
-hgcd_mul_vector (struct hgcd_row *Y, mp_size_t alloc,
- const struct hgcd_row *R, mp_size_t rsize,
- const struct hgcd_row *X, mp_size_t xsize,
- mp_ptr tp, mp_size_t talloc)
-{
- unsigned i;
-
- mp_size_t ysize;
- mp_limb_t h;
- int grow;
-
- MPN_NORMALIZE (R[1].uvp[1], rsize);
- /* u1 = 0 is an exceptional case. Except for this, u1 should be
- normalized. */
- ASSERT ((xsize == 1 && X[1].uvp[0][0] == 0)
- || X[1].uvp[0][xsize - 1] != 0);
-
- if (xsize == 1 && X[1].uvp[0][0] == 0)
- {
- /* Special case. Set Y[i, 0] = R[i, 0] */
- ASSERT (X[0].uvp[0][0] == 1);
-
- if (rsize > 1)
- MPN_NORMALIZE (R[1].uvp[0], rsize);
- MPN_COPY (Y[0].uvp[0], R[0].uvp[0], rsize);
- MPN_COPY (Y[1].uvp[0], R[1].uvp[0], rsize);
-
- return rsize;
- }
-
- ysize = rsize + xsize;
- ASSERT (ysize <= talloc);
-
- h = 0; grow = 0;
-
- if (rsize >= xsize)
- {
- for (i = 0; i < 2; i++)
- {
- /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */
- mp_limb_t cy;
-
- mpn_mul (Y[i].uvp[0], R[i].uvp[0], rsize, X[0].uvp[0], xsize);
- mpn_mul (tp, R[i].uvp[1], rsize, X[1].uvp[0], xsize);
-
- cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize);
-
- if (cy)
- {
- ASSERT (ysize + 1 < alloc);
- Y[i].uvp[0][ysize] = cy;
- grow = 1;
- }
- else
- h |= Y[i].uvp[0][ysize - 1];
- }
- }
- else
- {
- for (i = 0; i < 2; i++)
- {
- /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */
- mp_limb_t cy;
-
- mpn_mul (Y[i].uvp[0], X[0].uvp[0], xsize, R[i].uvp[0], rsize);
- mpn_mul (tp, X[1].uvp[0], xsize, R[i].uvp[1], rsize);
-
- cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize);
-
- if (cy)
- {
- ASSERT (ysize + 1 < alloc);
- Y[i].uvp[0][ysize] = cy;
- grow = 1;
- }
- else
- h |= Y[i].uvp[0][ysize - 1];
- }
- }
-
- if (grow)
- ysize++;
- else
- ysize -= (h == 0);
-
- ASSERT ((ysize == 1 && Y[1].uvp[0][0] == 0) || Y[1].uvp[0][ysize - 1] != 0);
-
- return ysize;
-}
-
-#define COMPUTE_V_ITCH(asize, bsize, usize) \
- ((usize) + (asize) + 1 + (bsize))
-
-/* Computes |v| = |(c - u a)| / b, where u may be positive or negative,
- and v is of the opposite sign. Requires that b, c, |u| <= a. */
-static mp_size_t
-compute_v (mp_ptr vp, mp_size_t valloc,
- mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize,
- mp_srcptr cp, mp_size_t csize,
+compute_v (mp_ptr vp,
+ mp_srcptr ap, mp_srcptr bp, mp_size_t n,
+ mp_srcptr gp, mp_size_t gn,
mp_srcptr up, mp_size_t usize,
- mp_ptr tp, mp_size_t talloc)
+ mp_ptr tp)
{
mp_size_t size;
- mp_size_t vsize;
- mp_ptr rp;
-
- ASSERT (asize);
- ASSERT (bsize);
- ASSERT (csize);
- ASSERT (asize >= bsize);
-
-#if 0
- trace ("compute_v: a = %Nd\n"
- " b = %Nd\n"
- " c = %Nd\n"
- " u = %Nd\n",
- ap, asize, bp, bsize, cp, csize, up, usize);
-#endif
-
- ASSERT (usize);
-
+ mp_size_t an;
+ mp_size_t bn;
+ mp_size_t vn;
+
+ ASSERT (n > 0);
+ ASSERT (gn > 0);
+ ASSERT (usize != 0);
+
size = ABS (usize);
+ ASSERT (size <= n);
- ASSERT (size <= asize);
- ASSERT (asize + size <= talloc);
+ an = n;
+ MPN_NORMALIZE (ap, an);
- mpn_mul (tp, ap, asize, up, size);
- size += asize;
+ if (an >= size)
+ mpn_mul (tp, ap, an, up, size);
+ else
+ mpn_mul (tp, up, size, ap, an);
+
+ size += an;
- ASSERT (csize <= size);
+ ASSERT (gn <= size);
if (usize > 0)
{
- /* |v| = -v = (u a - c) / b */
+ /* |v| = -v = (u a - g) / b */
- ASSERT_NOCARRY (mpn_sub (tp, tp, size, cp, csize));
+ ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
MPN_NORMALIZE (tp, size);
if (size == 0)
return 0;
@@ -838,495 +120,432 @@ compute_v (mp_ptr vp, mp_size_t valloc,
else
{ /* usize < 0 */
/* |v| = v = (c - u a) / b = (c + |u| a) / b */
- mp_limb_t cy = mpn_add (tp, tp, size, cp, csize);
+ mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
if (cy)
- {
- ASSERT (size < talloc);
- tp[size++] = cy;
- }
+ tp[size++] = cy;
}
/* Now divide t / b. There must be no remainder */
+ bn = n;
+ MPN_NORMALIZE (bp, bn);
+ ASSERT (size >= bn);
- ASSERT (size >= bsize);
- ASSERT (size + bsize <= talloc);
- rp = tp + size;
-
- vsize = size + 1 - bsize;
- ASSERT (vsize <= valloc);
+ vn = size + 1 - bn;
+ ASSERT (vn <= n + 1);
- mpn_tdiv_qr (vp, rp, 0, tp, size, bp, bsize);
- MPN_NORMALIZE (vp, vsize);
+ /* FIXME: Use divexact. Or do the entire calculation mod 2^{n *
+ GMP_NUMB_BITS}. */
+ mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn);
+ vn -= (vp[vn-1] == 0);
/* Remainder must be zero */
#if WANT_ASSERT
{
mp_size_t i;
- for (i = 0; i < bsize; i++)
+ for (i = 0; i < bn; i++)
{
- ASSERT (rp[i] == 0);
+ ASSERT (tp[i] == 0);
}
}
#endif
- return vsize;
+ return vn;
}
-static mp_size_t
-gcdext_schoenhage_itch (mp_size_t asize, mp_size_t bsize)
-{
- mp_size_t itch;
-
- mp_size_t ralloc = asize + 1;
- mp_size_t ualloc = bsize + 1;
- /* Input size for hgcd calls */
- mp_size_t halloc = (asize + 1) / 2;
+/* Temporary storage:
- /* Storage for the rows and quotient */
- mp_size_t rstorage = 4 * ralloc + 4 * ualloc + asize;
+ Initial division: Quotient of at most an - n + 1 <= an limbs.
- /* Storage for hgcd calls */
- mp_size_t tstorage = mpn_hgcd_init_itch (halloc)
- + qstack_itch (halloc)
- + mpn_hgcd_itch (halloc);
+ Storage for u0 and u1: 2(n+1).
- /* Storage needed for final gcdext_lehmer */
- mp_size_t lstorage
- = gcdext_lehmer_itch (GCDEXT_SCHOENHAGE_THRESHOLD,
- GCDEXT_SCHOENHAGE_THRESHOLD);
+ Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
- /* Storage needed after final nhgcd_gcdext_lehmer */
- mp_size_t fstorage
- = COMPUTE_V_ITCH (GCDEXT_SCHOENHAGE_THRESHOLD,
- GCDEXT_SCHOENHAGE_THRESHOLD,
- ualloc);
+ Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
+
+ When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 3(n+1) for the cofactors.
+
+ When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
+
+ For the Lehmer call after the loop, let T denote
+ GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
+ u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
+ + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient.
+
+*/
- /* We need rstorage + MAX (tstorage, lstorage, fstorage) */
+/* Optimal choice of p seems difficult. In each iteration the division
+ * of work between hgcd and the updates of u0 and u1 depends on the
+ * current size of the u. It may be desirable to use a different
+ * choice of p in each iteration. Also the input size seems to matter;
+ * choosing p = n / 3 in the first iteration seems to improve
+ * performance slightly for input size just above the threshold, but
+ * degrade performance for larger inputs. */
+#define CHOOSE_P_1(n) ((n) / 2)
+#define CHOOSE_P_2(n) ((n) / 3)
- itch = tstorage;
- if (lstorage > tstorage)
- itch = lstorage;
- if (fstorage > itch)
- itch = fstorage;
+mp_size_t
+mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
+ mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
+{
+ mp_size_t talloc;
+ mp_size_t scratch;
+ mp_size_t matrix_scratch;
+ mp_size_t ualloc = n + 1;
- return rstorage + itch;
-}
+ mp_size_t un;
+ mp_ptr u0;
+ mp_ptr u1;
-#if WANT_ASSERT
-static void
-sanity_check_row (mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize,
- int sign, mp_size_t usize,
- const struct hgcd_row *r)
-{
- /* Check that x = u * a + v * b, for some v, i.e. that
- x - u*a is divisible by b. */
- mp_srcptr up = r->uvp[0];
- mp_srcptr xp = r->rp;
- mp_size_t xsize = r->rsize;
mp_ptr tp;
- mp_size_t tsize;
- mp_ptr qp;
- mp_size_t qsize;
- mp_ptr rp;
- mp_size_t i;
+
TMP_DECL;
- TMP_MARK;
- ASSERT (asize > 0 && ap[asize - 1] != 0);
- ASSERT (bsize > 0 && bp[bsize - 1] != 0);
- ASSERT (xsize == 0 || xp[xsize - 1] != 0);
- ASSERT (MPN_LEQ_P (xp, xsize, ap, asize));
- ASSERT (MPN_LEQ_P (up, usize, bp, bsize));
+ ASSERT (an >= n);
+ ASSERT (n > 0);
- MPN_NORMALIZE (up, usize);
- if (usize == 0)
- {
- ASSERT (MPN_EQUAL_P (xp, xsize, bp, bsize));
- return;
- }
-
- tp = TMP_ALLOC_LIMBS (usize + asize + 1);
- qp = TMP_ALLOC_LIMBS (usize + asize + 2 - bsize);
- rp = TMP_ALLOC_LIMBS (bsize);
-
- mpn_mul (tp, ap, asize, up, usize);
- tsize = asize + usize;
- tsize -= (tp[tsize - 1] == 0);
+ TMP_MARK;
- if (sign >= 0)
- {
- ASSERT_NOCARRY (mpn_sub (tp, tp, tsize, xp, xsize));
- MPN_NORMALIZE (tp, tsize);
- }
- else
- {
- mp_limb_t cy = mpn_add (tp, tp, tsize, xp, xsize);
- tp[tsize] = cy;
- tsize += (cy != 0);
- }
+ /* FIXME: Check for small sizes first, before setting up temporary
+ storage etc. */
+ talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
+
+ /* For initial division */
+ scratch = an - n + 1;
+ if (scratch > talloc)
+ talloc = scratch;
- if (tsize > 0)
+ if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
{
- mpn_tdiv_qr (qp, rp, 0, tp, tsize, bp, bsize);
- for (i = 0; i < bsize; i++)
- ASSERT (rp[i] == 0);
- qsize = tsize - bsize;
- qsize += (qp[qsize] != 0);
- ASSERT (MPN_LEQ_P (qp, qsize, ap, asize));
- }
- TMP_FREE;
-}
-# define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r) \
-sanity_check_row (ap, asize, bp, bsize, sign, usize, r)
-
-#else /* !WANT_ASSERT */
-# define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r)
-#endif /* !WANT_ASSERT */
+ /* For hgcd loop. */
+ mp_size_t hgcd_scratch;
+ mp_size_t update_scratch;
+ mp_size_t p1 = CHOOSE_P_1 (n);
+ mp_size_t p2 = CHOOSE_P_2 (n);
+ mp_size_t min_p = MIN(p1, p2);
+ mp_size_t max_p = MAX(p1, p2);
+ matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
+ hgcd_scratch = mpn_hgcd_itch (n - min_p);
+ update_scratch = max_p + n - 1;
+
+ scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
+ if (scratch > talloc)
+ talloc = scratch;
-static mp_size_t
-gcdext_schoenhage (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
- mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize,
- mp_ptr tp, mp_size_t talloc)
-{
- mp_size_t scratch;
- struct hgcd hgcd;
- struct qstack quotients;
- struct hgcd_row r[4];
+ /* Final mpn_gcdext_lehmer_n call. Need space for u and for
+ copies of a and b. */
+ scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
+ + 3*GCDEXT_DC_THRESHOLD;
- /* Size and sign of u fields. The largest u should be normalized to
- this size, and except for the case u1 = 0, that is the latest
- u. */
- int rsize;
- int rsign;
+ if (scratch > talloc)
+ talloc = scratch;
- mp_ptr qp;
- mp_size_t qsize;
- mp_size_t ralloc = asize + 1;
- mp_size_t ualloc = bsize + 1;
-
- ASSERT (asize >= bsize);
- ASSERT (bsize > 0);
-
- ASSERT (MPN_LEQ_P (bp, bsize, ap, asize));
-
- ASSERT (4 * ralloc + 4*ualloc + asize <= talloc);
-
- r[0].rp = tp; tp += ralloc; talloc -= ralloc;
- r[1].rp = tp; tp += ralloc; talloc -= ralloc;
- r[2].rp = tp; tp += ralloc; talloc -= ralloc;
- r[3].rp = tp; tp += ralloc; talloc -= ralloc;
-
- /* Must zero out the u fields */
- MPN_ZERO (tp, 4 * ualloc);
-
- r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc;
- r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc;
- r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc;
- r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc;
+ /* Cofactors u0 and u1 */
+ talloc += 2*(n+1);
+ }
- qp = tp; tp += asize; talloc -= asize;
+ tp = TMP_ALLOC_LIMBS(talloc);
- ASSERT (asize >= bsize);
- ASSERT (bsize > 0);
- MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize;
- MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize;
+ if (an > n)
+ {
+ mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
- r[0].uvp[0][0] = 1;
- r[1].uvp[0][0] = 0;
+ if (mpn_zero_p (ap, n))
+ {
+ MPN_COPY (gp, bp, n);
+ *usizep = 0;
+ TMP_FREE;
+ return n;
+ }
+ }
- /* We don't use the v fields. */
- rsize = 1;
- rsign = 0;
+ if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
+ {
+ mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
- scratch = mpn_hgcd_init_itch ((asize + 1) / 2);
- ASSERT (scratch <= talloc);
- mpn_hgcd_init (&hgcd, (asize + 1) / 2, tp);
- tp += scratch; talloc -= scratch;
+ TMP_FREE;
+ return gn;
+ }
+
+ MPN_ZERO (tp, 2*ualloc);
+ u0 = tp; tp += ualloc;
+ u1 = tp; tp += ualloc;
{
- mp_size_t nlimbs = qstack_itch ((asize + 1) / 2);
+ /* For the first hgcd call, there are no u updates, and it makes
+ some sense to use a different choice for p. */
+
+ /* FIXME: We could trim use of temporary storage, since u0 and u1
+ are not used yet. For the hgcd call, we could swap in the u0
+ and u1 pointers for the relevant matrix elements. We could also
+ use a specialized hgcd function which computes only the last
+ two elements of the matrix. */
+
+ struct hgcd_matrix M;
+ mp_size_t p = CHOOSE_P_1 (n); /* Same as for gcd. */
+ mp_size_t nn;
+
+ mpn_hgcd_matrix_init (&M, n - p, tp);
+ nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
+ if (nn > 0)
+ {
+ ASSERT (M.n <= (n - p - 1)/2);
+ ASSERT (M.n + p <= (p + n - 1) / 2);
- ASSERT (nlimbs <= talloc);
- qstack_init (&quotients, (asize + 1) / 2, tp, nlimbs);
+ /* Temporary storage 2 (p + M->n) <= p + n - 1 */
+ n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
- tp += nlimbs;
- talloc -= nlimbs;
- scratch += nlimbs;
+ MPN_COPY (u0, M.p[1][0], M.n);
+ MPN_COPY (u1, M.p[1][1], M.n);
+ un = M.n;
+ while ( (u0[un-1] | u1[un-1] ) == 0)
+ un--;
+ }
+ else
+ {
+ /* mpn_hgcd has failed. Then either one of a or b is very
+ small, or the difference is very small. Perform one
+ subtraction followed by one division. */
+ mp_size_t gn;
+ mp_size_t updated_un = 1;
+
+ u1[0] = 1;
+
+ /* Temporary storage n + 1 */
+ n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
+ u0, u1, &updated_un, tp);
+ if (n == 0)
+ {
+ TMP_FREE;
+ return gn;
+ }
+
+ un = updated_un;
+ ASSERT (un < ualloc);
+ }
}
-
- while (ABOVE_THRESHOLD (r[0].rsize, GCDEXT_SCHOENHAGE_THRESHOLD)
- && r[1].rsize > 0)
+
+ while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
{
- mp_size_t k = r[0].rsize / 2;
- int res;
-
- ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r);
- ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 1);
-
- if (r[1].rsize <= k)
- goto euclid;
+ struct hgcd_matrix M;
+ mp_size_t p = CHOOSE_P_2 (n);
+ mp_size_t nn;
- qstack_reset (&quotients, r[0].rsize - k);
-
- res = mpn_hgcd (&hgcd,
- r[0].rp + k, r[0].rsize - k,
- r[1].rp + k, r[1].rsize - k,
- &quotients,
- tp, talloc);
-
- if (res == 0 || res == 1)
+ mpn_hgcd_matrix_init (&M, n - p, tp);
+ nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
+ if (nn > 0)
{
- euclid:
- qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize,
- r[0].rp, r[0].rsize,
- r[1].rp, r[1].rsize);
- rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc);
- ASSERT (rsize < ualloc);
-
- ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2);
-
- HGCD_SWAP3_LEFT (r);
- rsign = ~rsign;
+ mp_size_t n0, n1;
+ mp_ptr t0;
+ mp_ptr t1;
+
+ t0 = tp + matrix_scratch;
+ ASSERT (M.n <= (n - p - 1)/2);
+ ASSERT (M.n + p <= (p + n - 1) / 2);
+
+ /* Temporary storage 2 (p + M->n) <= p + n - 1 */
+ n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
+
+ t1 = t0 + un;
+
+ /* FIXME: This copying could be avoided by some swapping of
+ * pointers. May need more temporary storage, though. */
+ MPN_COPY (t0, u0, un);
+ MPN_COPY (t1, u1, un);
+
+ /* By the same analysis as for mpn_hgcd_matrix_mul */
+ ASSERT (M.n + un <= ualloc);
+
+ /* Temporary storage un */
+ n0 = addmul2_n (u0, t0, t1, un,
+ M.p[0][0], M.p[1][0], M.n, t1 + un);
+ n1 = addmul2_n (u1, t0, t1, un,
+ M.p[0][1], M.p[1][1], M.n, t1 + un);
+
+ if (n0 > un)
+ un = n0;
+ if (n1 > un)
+ un = n1;
+
+ ASSERT (un < ualloc);
+ ASSERT ( (u0[un-1] | u1[un-1]) > 0);
}
else
{
- const struct hgcd_row *s = hgcd.row + (res - 2);
- int sign = hgcd.sign;
- if (res == 3)
- sign = ~sign;
-
- /* s[0] and s[1] are correct */
- r[2].rsize
- = mpn_hgcd_fix (k, r[2].rp, ralloc,
- sign, hgcd.size, s,
- r[0].rp, r[1].rp,
- tp, talloc);
-
- r[3].rsize
- = mpn_hgcd_fix (k, r[3].rp, ralloc,
- ~sign, hgcd.size, s+1,
- r[0].rp, r[1].rp,
- tp, talloc);
-
- rsize = hgcd_mul_vector (r + 2, ualloc, s, hgcd.size,
- r, rsize, tp, talloc);
- ASSERT (rsize < ualloc);
-
- rsign ^= sign;
- ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2);
- ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 3);
-
- HGCD_SWAP4_2 (r);
+ /* mpn_hgcd has failed. Then either one of a or b is very
+ small, or the difference is very small. Perform one
+ subtraction followed by one division. */
+ mp_size_t gn;
+ mp_size_t updated_un = un;
+
+ /* Temporary storage n + 1 */
+ n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
+ u0, u1, &updated_un, tp);
+ if (n == 0)
+ {
+ TMP_FREE;
+ return gn;
+ }
+
+ un = updated_un;
+ ASSERT (un < ualloc);
}
}
- if (r[1].rsize == 0)
+
+ if (mpn_zero_p (ap, n))
{
- MPN_COPY (gp, r[0].rp, r[0].rsize);
- MPN_NORMALIZE (r[0].uvp[0], rsize);
- MPN_COPY (up, r[0].uvp[0], rsize);
+ MPN_COPY (gp, bp, n);
+ MPN_NORMALIZE (u0, un);
+ MPN_COPY (up, u0, un);
+ *usizep = -un;
- *usizep = (rsign >= 0) ? rsize : - rsize;
- return r[0].rsize;
+ TMP_FREE;
+ return n;
}
- else if (r[0].rsize == 1)
+ else if (mpn_zero_p (bp, n))
{
- mp_limb_t u;
- mp_limb_t v;
- mp_limb_t cy;
-
- gp[0] = gcdext_1 (&u, &v, r[0].rp[0], r[1].rp[0]);
+ MPN_COPY (gp, ap, n);
+ MPN_NORMALIZE (u1, un);
+ MPN_COPY (up, u1, un);
+ *usizep = un;
- /* g = u r0 + v r1 = (u u0 + v u1) a + (...) b */
- cy = mpn_addmul2_n_1 (up, rsize,
- r[0].uvp[0], u,
- r[1].uvp[0], v);
-
- rsize++;
- if (cy)
- up[rsize++] = cy;
- else
- MPN_NORMALIZE (up, rsize);
+ TMP_FREE;
+ return n;
+ }
+ else if (mpn_zero_p (u0, un))
+ {
+ mp_size_t gn;
+ ASSERT (un == 1);
+ ASSERT (u1[0] == 1);
- *usizep = (rsign >= 0) ? rsize : -rsize;
- return 1;
+ /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
+ gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
+ TMP_FREE;
+ return gn;
}
else
{
- /* We have r0 = u0 a + v0 b,
- r1 = u1 a + v1 b
+ /* We have A = ... a + ... b
+ B = u0 a + u1 b
+
+ a = u1 A + ... B
+ b = -u0 A + ... B
- Compute g = u r0 + v r1 = (u u0 + v u1) a + (...) b
- In the expression (u u0 + v u1), we have
+ with bounds
- u <= r1,
- u0 <= b/r0 (except if r0 = a, which should never be the case here)
- v <= r0
- u1 <= b/r0
- */
+ |u0|, |u1| <= B / min(a, b)
+
+ Compute g = u a + v b = (u u1 - v u0) A + (...) B
+ Here, u, v are bounded by
- mp_size_t gsize;
- mp_size_t usize;
- mp_size_t vsize;
-
- /* u1 should be non-zero, and normalized */
- ASSERT (rsize);
- ASSERT (r[1].uvp[0][rsize - 1] != 0);
-#if WANT_TRACE
- trace ("gcdext: \n"
- "r0 = %Nd\n"
- "r1 = %Nd\n"
- "u0 = %Nd\n"
- "u1 = %Nd\n",
- r[0].rp, r[0].rsize, r[1].rp, r[1].rsize,
- r[0].uvp[0], rsize, r[1].uvp[0], rsize);
-#endif
- /* We don't need the space for hgcd and the quotient stack any more */
- tp -= scratch; talloc += scratch;
-
- /* Stores u in r[2] and v in r[3] */
- gsize = gcdext_lehmer (gp, r[2].uvp[0], &usize,
- r[0].rp, r[0].rsize,
- r[1].rp, r[1].rsize,
- tp, talloc);
+ |u| <= b,
+ |v| <= a
+ */
- if (usize == 0)
+ mp_size_t u0n;
+ mp_size_t u1n;
+ mp_size_t lehmer_un;
+ mp_size_t lehmer_vn;
+ mp_size_t gn;
+
+ mp_ptr lehmer_up;
+ mp_ptr lehmer_vp;
+ int negate;
+
+ lehmer_up = tp; tp += n;
+
+ /* Call mpn_gcdext_lehmer_n with copies of a and b. */
+ MPN_COPY (tp, ap, n);
+ MPN_COPY (tp + n, bp, n);
+ gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
+
+ u0n = un;
+ MPN_NORMALIZE (u0, u0n);
+ if (lehmer_un == 0)
{
- /* u == 0 ==> v = g / b == 1 ==> g = u1 a + (...) b */
+ /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
+ MPN_COPY (up, u0, u0n);
+ *usizep = -u0n;
- MPN_NORMALIZE (r[1].uvp[0], rsize);
- MPN_COPY (up, r[1].uvp[0], rsize);
- *usizep = (rsign >= 0) ? - rsize : rsize;
-
- return gsize;
+ TMP_FREE;
+ return gn;
}
- /* Compute v = (g - s r0) / r1, storing it in r[3] */
- vsize = compute_v (r[3].uvp[0], ualloc,
- r[0].rp, r[0].rsize, r[1].rp, r[1].rsize,
- gp, gsize,
- r[2].uvp[0], usize,
- tp, talloc);
+ lehmer_vp = tp;
+ /* Compute v = (g - u a) / b */
+ lehmer_vn = compute_v (lehmer_vp,
+ ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
- if (usize < 0)
+ if (lehmer_un > 0)
+ negate = 0;
+ else
{
- usize = - usize;
- rsign = ~rsign;
+ lehmer_un = -lehmer_un;
+ negate = 1;
}
- /* It's possible that u0 = 0, u1 = 1 */
- if (rsize == 1 && r[0].uvp[0][0] == 0)
- {
- /* u0 == 0 ==> u u0 + v u1 = v */
- MPN_COPY (up, r[3].uvp[0], vsize);
- *usizep = (rsign >= 0) ? vsize : - vsize;
+ u1n = un;
+ MPN_NORMALIZE (u1, u1n);
- return gsize;
+ /* It's possible that u0 = 1, u1 = 0 */
+ if (u1n == 0)
+ {
+ ASSERT (un == 1);
+ ASSERT (u0[0] == 1);
+
+ /* u1 == 0 ==> u u1 + v u0 = v */
+ MPN_COPY (up, lehmer_vp, lehmer_vn);
+ *usizep = negate ? lehmer_vn : - lehmer_vn;
+
+ TMP_FREE;
+ return gn;
}
- /* Ok, now u0, u1, u are non-zero. We may still have v == 0 */
- ASSERT (usize + rsize <= ualloc);
- ASSERT (vsize + rsize <= ualloc);
+ ASSERT (lehmer_un + u1n <= ualloc);
+ ASSERT (lehmer_vn + u0n <= ualloc);
+
+ /* Now u0, u1, u are non-zero. We may still have v == 0 */
/* Compute u u0 */
- if (usize <= rsize)
+ if (lehmer_un <= u1n)
/* Should be the common case */
- mpn_mul (up,
- r[0].uvp[0], rsize,
- r[2].uvp[0], usize);
+ mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
else
- mpn_mul (up,
- r[2].uvp[0], usize,
- r[0].uvp[0], rsize);
+ mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
- usize += rsize;
+ un = u1n + lehmer_un;
+ un -= (up[un - 1] == 0);
- /* There may be more than one zero limb, if #u0 < #u1 */
- MPN_NORMALIZE (up, usize);
- ASSERT (usize < ualloc);
-
- if (vsize)
+ if (lehmer_vn > 0)
{
mp_limb_t cy;
- /* Overwrites old r[2].uvp[0] value */
- if (vsize <= rsize)
+ /* Overwrites old u1 value */
+ if (lehmer_vn <= u0n)
/* Should be the common case */
- cy = mpn_mul (r[2].uvp[0],
- r[1].uvp[0], rsize,
- r[3].uvp[0], vsize);
+ mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
else
- cy = mpn_mul (r[2].uvp[0],
- r[3].uvp[0], vsize,
- r[1].uvp[0], rsize);
+ mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
- vsize += rsize - (cy == 0);
- ASSERT (vsize < ualloc);
+ u1n = u0n + lehmer_vn;
+ u1n -= (u1[u1n - 1] == 0);
- if (vsize <= usize)
- cy = mpn_add (up, up, usize, r[2].uvp[0], vsize);
+ if (u1n <= un)
+ {
+ cy = mpn_add (up, up, un, u1, u1n);
+ }
else
{
- cy = mpn_add (up, r[2].uvp[0], vsize, up, usize);
- usize = vsize;
+ cy = mpn_add (up, u1, u1n, up, un);
+ un = u1n;
}
- up[usize] = cy;
- usize += (cy != 0);
+ up[un] = cy;
+ un += (cy != 0);
- ASSERT (usize < ualloc);
+ ASSERT (un < ualloc);
}
- *usizep = (rsign >= 0) ? usize : -usize;
+ *usizep = negate ? -un : un;
- return gsize;
- }
-}
-
-mp_size_t
-mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
- mp_ptr ap, mp_size_t asize, mp_ptr bp, mp_size_t bsize)
-{
- ASSERT (asize >= bsize);
- ASSERT (bsize > 0);
-
- if (asize == 1)
- {
-#if GCDEXT_1_USE_BINARY
- mp_limb_t v;
- *gp = gcdext_1 (up, &v, ap[0], bp[0]);
-#else
- *gp = gcdext_1_u (up, ap[0], bp[0]);
-#endif
- *usizep = (up[0] != 0);
- ASSERT(gp[0] != 0);
- return 1;
- }
- else if (BELOW_THRESHOLD (asize, GCDEXT_SCHOENHAGE_THRESHOLD))
- {
- mp_size_t gsize;
- mp_ptr tp;
- mp_size_t talloc = gcdext_lehmer_itch (asize, bsize);
- TMP_DECL;
- TMP_MARK;
-
- tp = TMP_ALLOC_LIMBS (talloc);
- gsize = gcdext_lehmer (gp, up, usizep, ap, asize, bp, bsize,
- tp, talloc);
- TMP_FREE;
- return gsize;
- }
- else
- {
- mp_size_t gsize;
- mp_ptr tp;
- mp_size_t talloc = gcdext_schoenhage_itch (asize, bsize);
- TMP_DECL;
- TMP_MARK;
-
- tp = TMP_ALLOC_LIMBS (talloc);
- gsize = gcdext_schoenhage (gp, up, usizep, ap, asize, bp, bsize,
- tp, talloc);
TMP_FREE;
- return gsize;
+ return gn;
}
}
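
The compute_v step used above recovers the second cofactor from the first
one. The same identity at the mpz level: given g = gcd(a, b) and u with
g = u*a + v*b, the other cofactor is v = (g - u*a)/b, an exact division.
A minimal sketch; the helper name is made up for illustration, and only
documented mpz calls are used.

#include <gmp.h>

static void
recover_v (mpz_t v, const mpz_t g, const mpz_t u,
           const mpz_t a, const mpz_t b)
{
  mpz_t t;
  mpz_init (t);
  mpz_mul (t, u, a);          /* t = u*a */
  mpz_sub (t, g, t);          /* t = g - u*a, divisible by b */
  mpz_divexact (v, t, b);     /* v = (g - u*a)/b */
  mpz_clear (t);
}
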
diff --git a/mpn/generic/gcdext_1.c b/mpn/generic/gcdext_1.c
new file mode 100644
index 000000000..efade2b4c
--- /dev/null
+++ b/mpn/generic/gcdext_1.c
@@ -0,0 +1,319 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+/* Default to binary gcdext_1, since it is best on most current machines.
+ We should teach tuneup to choose the right gcdext_1. */
+#define GCDEXT_1_USE_BINARY 1
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef NULL
+# define NULL ((void *) 0)
+#endif
+
+/* FIXME: Takes two single-word limbs. It could be extended to a
+ * function that accepts a bignum for the first input, and only
+ * returns the first co-factor. */
+
+/* Returns g, u and v such that g = u A - v B. There are three
+ different cases for the result:
+
+ g = u A - v B, 0 < u < b, 0 < v < a
+ g = A u = 1, v = 0
+ g = B u = B, v = A - 1
+
+ We always return with 0 < u <= b, 0 <= v < a.
+*/
+#if GCDEXT_1_USE_BINARY
+
+static mp_limb_t
+gcdext_1_odd (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b)
+{
+ mp_limb_t u0;
+ mp_limb_t v0;
+ mp_limb_t v1;
+ mp_limb_t u1;
+
+ mp_limb_t B = b;
+ mp_limb_t A = a;
+
+ /* Throughout this function maintain
+
+ a = u0 A - v0 B
+ b = u1 A - v1 B
+
+ where A and B are odd. */
+
+ u0 = 1; v0 = 0;
+ u1 = b; v1 = a-1;
+
+ if (A == 1)
+ {
+ *up = u0; *vp = v0;
+ return 1;
+ }
+ else if (B == 1)
+ {
+ *up = u1; *vp = v1;
+ return 1;
+ }
+
+ while (a != b)
+ {
+ mp_limb_t mask;
+
+ ASSERT (a % 2 == 1);
+ ASSERT (b % 2 == 1);
+
+ ASSERT (0 < u0); ASSERT (u0 <= B);
+ ASSERT (0 < u1); ASSERT (u1 <= B);
+
+ ASSERT (0 <= v0); ASSERT (v0 < A);
+ ASSERT (0 <= v1); ASSERT (v1 < A);
+
+ if (a > b)
+ {
+ MP_LIMB_T_SWAP (a, b);
+ MP_LIMB_T_SWAP (u0, u1);
+ MP_LIMB_T_SWAP (v0, v1);
+ }
+
+ ASSERT (a < b);
+
+ /* Makes b even */
+ b -= a;
+
+ mask = - (mp_limb_t) (u1 < u0);
+ u1 += B & mask;
+ v1 += A & mask;
+ u1 -= u0;
+ v1 -= v0;
+
+ ASSERT (b % 2 == 0);
+
+ do
+ {
+ /* As b = u1 A - v1 B is even, while A and B are odd,
+ either both or neither of u1, v1 is even */
+
+ ASSERT (u1 % 2 == v1 % 2);
+
+ mask = -(u1 & 1);
+ u1 = u1 / 2 + ((B / 2) & mask) - mask;
+ v1 = v1 / 2 + ((A / 2) & mask) - mask;
+
+ b /= 2;
+ }
+ while (b % 2 == 0);
+ }
+
+ /* Now g = a = b */
+ ASSERT (a == b);
+ ASSERT (u1 <= B);
+ ASSERT (v1 < A);
+
+ ASSERT (A % a == 0);
+ ASSERT (B % a == 0);
+ ASSERT (u0 % (B/a) == u1 % (B/a));
+ ASSERT (v0 % (A/a) == v1 % (A/a));
+
+ *up = u0; *vp = v0;
+
+ return a;
+}
+
+mp_limb_t
+mpn_gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b)
+{
+ unsigned shift = 0;
+ mp_limb_t g;
+ mp_limb_t u;
+ mp_limb_t v;
+
+ /* We use unsigned values in the range 0, ... B - 1. As the values
+ are uniquely determined only modulo B, we can add B at will, to
+ get numbers in range or flip the least significant bit. */
+ /* Deal with powers of two */
+ while ((a | b) % 2 == 0)
+ {
+ a /= 2; b /= 2; shift++;
+ }
+
+ if (b % 2 == 0)
+ {
+ unsigned k = 0;
+
+ do {
+ b /= 2; k++;
+ } while (b % 2 == 0);
+
+ g = gcdext_1_odd (&u, &v, a, b);
+
+ while (k--)
+ {
+ /* We have g = u a + v b, and need to construct
+ g = u'a + v'(2b).
+
+ If v is even, we can just set u' = u, v' = v/2
+ If v is odd, we can set v' = (v + a)/2, u' = u + b
+ */
+
+ if (v % 2 == 0)
+ v /= 2;
+ else
+ {
+ u = u + b;
+ v = v/2 + a/2 + 1;
+ }
+ b *= 2;
+ }
+ }
+ else if (a % 2 == 0)
+ {
+ unsigned k = 0;
+
+ do {
+ a /= 2; k++;
+ } while (a % 2 == 0);
+
+ g = gcdext_1_odd (&u, &v, a, b);
+
+ while (k--)
+ {
+ /* We have g = u a + v b, and need to construct
+ g = u'(2a) + v'b.
+
+ If u is even, we can just set u' = u/2, v' = v.
+ If u is odd, we can set u' = (u + b)/2
+ */
+
+ if (u % 2 == 0)
+ u /= 2;
+ else
+ {
+ u = u/2 + b/2 + 1;
+ v = v + a;
+ }
+ a *= 2;
+ }
+ }
+ else
+ /* Ok, both are odd */
+ g = gcdext_1_odd (&u, &v, a, b);
+
+ *up = u;
+ *vp = v;
+
+ return g << shift;
+}
+
+#else /* ! GCDEXT_1_USE_BINARY */
+static mp_limb_t
+gcdext_1_u (mp_limb_t *up, mp_limb_t a, mp_limb_t b)
+{
+ /* Maintain
+
+ a = u0 A mod B
+ b = - u1 A mod B
+ */
+ mp_limb_t u0 = 1;
+ mp_limb_t u1 = 0;
+ mp_limb_t B = b;
+
+ ASSERT (a >= b);
+ ASSERT (b > 0);
+
+ for (;;)
+ {
+ mp_limb_t q;
+
+ q = a / b;
+ a -= q * b;
+
+ if (a == 0)
+ {
+ *up = B - u1;
+ return b;
+ }
+ u0 += q * u1;
+
+ q = b / a;
+ b -= q * a;
+
+ if (b == 0)
+ {
+ *up = u0;
+ return a;
+ }
+ u1 += q * u0;
+ }
+}
+
+mp_limb_t
+mpn_gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b)
+{
+ /* Maintain
+
+ a = u0 A - v0 B
+ b = - u1 A + v1 B = (B - u1) A - (A - v1) B
+ */
+ mp_limb_t u0 = 1;
+ mp_limb_t v0 = 0;
+ mp_limb_t u1 = 0;
+ mp_limb_t v1 = 1;
+
+ mp_limb_t A = a;
+ mp_limb_t B = b;
+
+ ASSERT (a >= b);
+ ASSERT (b > 0);
+
+ for (;;)
+ {
+ mp_limb_t q;
+
+ q = a / b;
+ a -= q * b;
+
+ if (a == 0)
+ {
+ *up = B - u1;
+ *vp = A - v1;
+ return b;
+ }
+ u0 += q * u1;
+ v0 += q * v1;
+
+ q = b / a;
+ b -= q * a;
+
+ if (b == 0)
+ {
+ *up = u0;
+ *vp = v0;
+ return a;
+ }
+ u1 += q * u0;
+ v1 += q * v0;
+ }
+}
+#endif /* ! GCDEXT_1_USE_BINARY */
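
The single-limb routines above use the convention g = u A - v B with
unsigned cofactors. For comparison, a minimal sketch of the textbook
extended Euclid with signed cofactors, g = s*a + t*b; plain C, assumes
a, b > 0, and the function name is illustrative, not a GMP interface.

static long long
xgcd_sketch (long long a, long long b, long long *s, long long *t)
{
  long long s0 = 1, s1 = 0;   /* cofactors of a */
  long long t0 = 0, t1 = 1;   /* cofactors of b */

  while (b != 0)
    {
      long long q = a / b;
      long long r = a - q * b;          /* invariant: a = s0*A + t0*B */
      long long s2 = s0 - q * s1;       /*            b = s1*A + t1*B */
      long long t2 = t0 - q * t1;
      a = b; b = r;
      s0 = s1; s1 = s2;
      t0 = t1; t1 = t2;
    }
  *s = s0;
  *t = t0;
  return a;                   /* gcd, with gcd = s0*A + t0*B */
}
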
diff --git a/mpn/generic/gcdext_lehmer.c b/mpn/generic/gcdext_lehmer.c
new file mode 100644
index 000000000..34a503d19
--- /dev/null
+++ b/mpn/generic/gcdext_lehmer.c
@@ -0,0 +1,162 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Temporary storage: 2*(n+1) for u. n+1 for the matrix-vector
+ multiplications (if hgcd2 succeeds). If hgcd fails, n+1 limbs are
+ needed for the division, with at most n for the quotient, and n+1 for
+ the product q u0. In all, 4n + 3. */
+
+mp_size_t
+mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize,
+ mp_ptr ap, mp_ptr bp, mp_size_t n,
+ mp_ptr tp)
+{
+ mp_size_t ualloc = n + 1;
+
+ /* Keeps track of the second row of the reduction matrix
+ *
+ * M = (v0, v1 ; u0, u1)
+ *
+ * which corresponds to the first column of the inverse
+ *
+ * M^{-1} = (u1, -v1; -u0, v0)
+ */
+
+ mp_size_t un;
+ mp_ptr u0;
+ mp_ptr u1;
+
+ MPN_ZERO (tp, 2*ualloc);
+ u0 = tp; tp += ualloc;
+ u1 = tp; tp += ualloc;
+
+ u1[0] = 1; un = 1;
+
+ /* FIXME: Handle n == 2 differently, after the loop? */
+ while (n >= 2)
+ {
+ struct hgcd_matrix1 M;
+ mp_limb_t ah, al, bh, bl;
+ mp_limb_t mask;
+
+ mask = ap[n-1] | bp[n-1];
+ ASSERT (mask > 0);
+
+ if (mask & GMP_NUMB_HIGHBIT)
+ {
+ ah = ap[n-1]; al = ap[n-2];
+ bh = bp[n-1]; bl = bp[n-2];
+ }
+ else if (n == 2)
+ {
+ /* We use the full inputs without truncation, so we can
+ safely shift left. */
+ int shift;
+
+ count_leading_zeros (shift, mask);
+ ah = MPN_EXTRACT_NUMB (shift, ap[1], ap[0]);
+ al = ap[0] << shift;
+ bh = MPN_EXTRACT_NUMB (shift, bp[1], bp[0]);
+ bl = bp[0] << shift;
+ }
+ else
+ {
+ int shift;
+
+ count_leading_zeros (shift, mask);
+ ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+ al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+ bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+ bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+ }
+
+ /* Try an mpn_hgcd2 step */
+ if (mpn_hgcd2 (ah, al, bh, bl, &M))
+ {
+ n = mpn_hgcd_mul_matrix1_inverse_vector (&M, n, ap, bp, tp);
+ un = mpn_hgcd_mul_matrix1_vector(&M, un, u0, u1, tp);
+ }
+ else
+ {
+ /* mpn_hgcd2 has failed. Then either one of a or b is very
+ small, or the difference is very small. Perform one
+ subtraction followed by one division. */
+ mp_size_t gn;
+ mp_size_t updated_un = un;
+
+ /* Temporary storage n + 1 */
+ n = mpn_gcdext_subdiv_step (gp, &gn, up, usize, ap, bp, n,
+ u0, u1, &updated_un, tp);
+ if (n == 0)
+ return gn;
+
+ un = updated_un;
+ }
+ }
+ if (ap[0] == 0)
+ {
+ gp[0] = bp[0];
+
+ MPN_NORMALIZE (u0, un);
+ MPN_COPY (up, u0, un);
+
+ *usize = -un;
+ return 1;
+ }
+ else if (bp[0] == 0)
+ {
+ gp[0] = ap[0];
+
+ MPN_NORMALIZE (u1, un);
+ MPN_COPY (up, u1, un);
+
+ *usize = un;
+ return 1;
+ }
+ else
+ {
+ mp_limb_t uh, vh;
+ mp_limb_t u;
+ mp_limb_t v;
+
+ gp[0] = mpn_gcdext_1 (&u, &v, ap[0], bp[0]);
+
+ /* Set up = u u1 + v u0. Keep track of size, un grows by one or
+ two limbs. */
+ uh = mpn_mul_1 (up, u1, un, u);
+ vh = mpn_addmul_1 (up, u0, un, v);
+
+ if ( (uh | vh) > 0)
+ {
+ mp_limb_t cy;
+ uh += vh;
+ up[un++] = uh;
+ if (uh < vh)
+ up[un++] = 1;
+ }
+
+ *usize = un;
+ return 1;
+ }
+}
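
The base case above forms up = u*u1 + v*u0 with mpn_mul_1 and mpn_addmul_1
and then fixes up the one or two top limbs. The same pattern in isolation,
as a sketch: gmp-impl.h is assumed for the mpn types and MPN_NORMALIZE,
nails are ignored, and the function name is illustrative.

#include "gmp.h"
#include "gmp-impl.h"

/* r = u*x + v*y for n-limb x, y (n >= 1); r needs n+2 limbs.  Returns
   the normalized size of the result.  Sketch only. */
static mp_size_t
combine_1_sketch (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n,
                  mp_limb_t u, mp_limb_t v)
{
  mp_limb_t uh, vh;

  uh = mpn_mul_1 (rp, xp, n, u);       /* rp[0..n-1] = u*x, high limb uh */
  vh = mpn_addmul_1 (rp, yp, n, v);    /* rp[0..n-1] += v*y, carry vh */

  uh += vh;                            /* combined high limb */
  rp[n] = uh;
  rp[n + 1] = (uh < vh);               /* carry out of the high addition */

  n += 2;
  MPN_NORMALIZE (rp, n);
  return n;
}
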
diff --git a/mpn/generic/gcdext_subdiv_step.c b/mpn/generic/gcdext_subdiv_step.c
new file mode 100644
index 000000000..8a4ba1f42
--- /dev/null
+++ b/mpn/generic/gcdext_subdiv_step.c
@@ -0,0 +1,188 @@
+/* gcdext_subdiv_step.c.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static inline int
+mpn_zero_p (mp_srcptr ap, mp_size_t n)
+{
+ mp_size_t i;
+ for (i = n - 1; i >= 0; i--)
+ {
+ if (ap[i] != 0)
+ return 0;
+ }
+ return 1;
+}
+
+/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or
+ b is small, or the difference is small. Performs one subtraction
+ followed by one division. If the gcd is found, stores it in gp and
+ *gn, and returns zero. Otherwise, computes the reduced a and b and
+ the updated cofactors, and returns the new size. */
+
+/* Temporary storage: Let N be a bound both for the inputs a, b, and
+ the cofactors u0, u1 after the division step. Then up to N is
+ needed for the quotient, and N+1 for the product q u0. All in all,
+ 2N + 1. */
+mp_size_t
+mpn_gcdext_subdiv_step (mp_ptr gp, mp_size_t *gn, mp_ptr up, mp_size_t *usizep,
+ mp_ptr ap, mp_ptr bp, mp_size_t n,
+ mp_ptr u0, mp_ptr u1, mp_size_t *unp, mp_ptr tp)
+
+{
+ mp_size_t an, bn, un;
+ mp_size_t qn;
+ mp_size_t u0n;
+
+ int swapped;
+
+ an = bn = n;
+
+ ASSERT (an > 0);
+ ASSERT (ap[an-1] > 0 || bp[an-1] > 0);
+
+ MPN_NORMALIZE (ap, an);
+ MPN_NORMALIZE (bp, bn);
+
+ un = *unp;
+
+ swapped = 0;
+
+ if (UNLIKELY (an == 0))
+ {
+ return_b:
+ MPN_COPY (gp, bp, bn);
+ *gn = bn;
+
+ MPN_NORMALIZE (u0, un);
+ MPN_COPY (up, u0, un);
+
+ *usizep = swapped ? un : -un;
+
+ return 0;
+ }
+ else if (UNLIKELY (bn == 0))
+ {
+ return_a:
+ MPN_COPY (gp, ap, an);
+ *gn = an;
+
+ MPN_NORMALIZE (u1, un);
+ MPN_COPY (up, u1, un);
+
+ *usizep = swapped ? -un : un;
+
+ return 0;
+ }
+
+ /* Arrange so that a > b, subtract a -= b, and maintain
+ normalization. */
+ if (an < bn)
+ {
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ MP_PTR_SWAP (u0, u1);
+ swapped ^= 1;
+ }
+ else if (an == bn)
+ {
+ int c;
+ MPN_CMP (c, ap, bp, an);
+ if (UNLIKELY (c == 0))
+ goto return_a;
+ else if (c < 0)
+ {
+ MP_PTR_SWAP (ap, bp);
+ MP_PTR_SWAP (u0, u1);
+ swapped ^= 1;
+ }
+ }
+ /* Reduce a -= b, u1 += u0 */
+ ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn));
+ MPN_NORMALIZE (ap, an);
+ ASSERT (an > 0);
+
+ u1[un] = mpn_add_n (u1, u1, u0, un);
+ un += (u1[un] > 0);
+
+ /* Arrange so that a > b, and divide a = q b + r */
+ if (an < bn)
+ {
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ MP_PTR_SWAP (u0, u1);
+ swapped ^= 1;
+ }
+ else if (an == bn)
+ {
+ int c;
+ MPN_CMP (c, ap, bp, an);
+ if (UNLIKELY (c == 0))
+ goto return_a;
+ else if (c < 0)
+ {
+ MP_PTR_SWAP (ap, bp);
+ MP_PTR_SWAP (u0, u1);
+ swapped ^= 1;
+ }
+ }
+
+ /* Reduce a -= q b, u1 += q u0 */
+ qn = an - bn + 1;
+ mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
+
+ if (mpn_zero_p (ap, bn))
+ goto return_b;
+
+ n = bn;
+
+ /* Update u1 += q u0 */
+ u0n = un;
+ MPN_NORMALIZE (u0, u0n);
+
+ if (u0n > 0)
+ {
+ qn -= (tp[qn - 1] == 0);
+
+ if (qn > u0n)
+ mpn_mul (tp + qn, tp, qn, u0, u0n);
+ else
+ mpn_mul (tp + qn, u0, u0n, tp, qn);
+
+ if (qn + u0n > un)
+ {
+ ASSERT_NOCARRY (mpn_add (u1, tp + qn, qn + u0n, u1, un));
+ un = qn + u0n;
+ un -= (u1[un-1] == 0);
+ }
+ else
+ {
+ u1[un] = mpn_add (u1, u1, un, tp + qn, qn + u0n);
+ un += (u1[un] > 0);
+ }
+ }
+
+ *unp = un;
+ return n;
+}
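The subtraction-then-division bookkeeping above has a simple single-limb analogue. The following is a minimal standalone sketch in plain C (not part of this patch; the sign tracking done by the swapped flag is omitted), mirroring the u1 += u0 update after the subtraction and the u1 += q*u0 update after the division, on ordinary integers.

/* Single-limb analogue of the reduction above: one subtraction
   followed by one division, with the cofactor magnitudes updated as
   u1 += u0 and then u1 += q * u0.  Sign bookkeeping (the swapped
   flag) is left out. */
#include <stdio.h>

int
main (void)
{
  unsigned long a = 1071, b = 462;  /* example inputs */
  unsigned long u0 = 1, u1 = 1;     /* cofactor magnitudes from an earlier step */
  unsigned long q, t;

  /* Subtraction step: arrange a > b, then a -= b, u1 += u0 */
  if (a < b)
    {
      t = a; a = b; b = t;
      t = u0; u0 = u1; u1 = t;
    }
  a -= b;
  u1 += u0;

  /* Division step: arrange a > b, then a -= q*b, u1 += q*u0 */
  if (a < b)
    {
      t = a; a = b; b = t;
      t = u0; u0 = u1; u1 = t;
    }
  q = a / b;
  a -= q * b;
  u1 += q * u0;

  printf ("reduced pair: %lu %lu, cofactor magnitudes: %lu %lu\n",
          a, b, u0, u1);
  return 0;
}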
diff --git a/mpn/generic/hgcd.c b/mpn/generic/hgcd.c
index 8f1967b32..ae8053d77 100644
--- a/mpn/generic/hgcd.c
+++ b/mpn/generic/hgcd.c
@@ -4,7 +4,7 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
+Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -21,2125 +21,624 @@ License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
-#define WANT_TRACE 0
-
-#if WANT_TRACE
-# include <stdio.h>
-# include <stdarg.h>
-#endif
-
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
-#if WANT_TRACE
-static void
-trace (const char *format, ...)
+/* For input of size n, matrix elements are of size at most ceil(n/2)
+ - 1, but we need two limbs extra. */
+void
+mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p)
{
- va_list args;
- va_start (args, format);
- gmp_vfprintf (stderr, format, args);
- va_end (args);
+ mp_size_t s = (n+1)/2 + 1;
+ M->alloc = s;
+ M->n = 1;
+ MPN_ZERO (p, 4 * s);
+ M->p[0][0] = p;
+ M->p[0][1] = p + s;
+ M->p[1][0] = p + 2 * s;
+ M->p[1][1] = p + 3 * s;
+
+ M->p[0][0][0] = M->p[1][1][0] = 1;
}
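As a concrete check of the size bound in the comment above: with n = 10 limbs the initializer uses s = (10+1)/2 + 1 = 6, so the four matrix elements occupy 4*6 = 24 limbs of the supplied block p, and M starts out as the 2x2 identity with M->n = 1.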
-#endif
-
-/* Comparison of _normalized_ numbers. */
-
-#define MPN_EQUAL_P(ap, asize, bp, bsize) \
-((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0)
-
-#define MPN_LEQ_P(ap, asize, bp, bsize) \
-((asize) < (bsize) || ((asize) == (bsize) \
- && mpn_cmp ((ap), (bp), (asize)) <= 0))
-
-#define MPN_LESS_P(ap, asize, bp, bsize) \
-((asize) < (bsize) || ((asize) == (bsize) \
- && mpn_cmp ((ap), (bp), (asize)) < 0))
-/* Extract one limb, shifting count bits left
- ________ ________
- |___xh___||___xl___|
- |____r____|
- >count <
-
- The count includes any nail bits, so it should work fine if
- count is computed using count_leading_zeros.
-*/
-
-#define MPN_EXTRACT_LIMB(count, xh, xl) \
- ((((xh) << ((count) - GMP_NAIL_BITS)) & GMP_NUMB_MASK) | \
- ((xl) >> (GMP_LIMB_BITS - (count))))
-
-
-/* Return -1 if a < x + y + z,
- 0 if a = x + y + z,
- 1 if a > x + y + z. */
-static int
-mpn_cmp_sum3 (mp_srcptr ap, mp_size_t an,
- mp_srcptr xp, mp_size_t xn,
- mp_srcptr yp, mp_size_t yn,
- mp_srcptr zp, mp_size_t zn)
+/* Update column COL, adding in column (1-COL). */
+static void
+hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col)
{
- mp_limb_t cy;
+ mp_limb_t c0, c1;
+ ASSERT (col < 2);
- /* Check that all limbs beyond an are zero. This should be slightly
- cheaper than fully normalizing all the input numbers. */
+ c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n);
+ c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n);
- while (xn > an)
- if (xp[--xn] > 0) return -1;
- while (yn > an)
- if (yp[--yn] > 0) return -1;
- while (zn > an)
- if (zp[--zn] > 0) return -1;
+ M->p[0][col][M->n] = c0;
+ M->p[1][col][M->n] = c1;
- /* Start by sorting so that xn >= yn >= zn. Six permutations, so we
- can't get away with less than three comparisons, at least not for
- the worst case. */
-
- if (xn < yn)
- MPN_SRCPTR_SWAP (xp, xn, yp, yn);
- if (yn < zn)
- MPN_SRCPTR_SWAP (yp, yn, zp, zn);
- if (xn < yn)
- MPN_SRCPTR_SWAP (xp, xn, yp, yn);
-
- ASSERT (an >= xn && xn >= yn && yn >= zn);
-
- /* Assume that a = x + y + z, and write the addition limb by limb.
-
- (c[1], a[0]) = x[0] + y[0] + z[0] + c[0]
- (c[2], a[1]) = x[1] + y[1] + z[1] + c[1]
- (c[k+1], a[k]) = x[k] + y[k] + z[k] + c[2]
- ...
- (c[n], a[n-1]) = x[n-1] + y[n-1] + z[n-1] + c[n-1]
-
- where the start and stop conditions are that c[0] = c[n] = 0.
- Then we can start at the high end, iterating
-
- c[k] = (c[k+1], a[k]) - x[k] - y[k] - z[k]
-
- If equality holds, then 0 <= c[k] <= 2 for all k (since for
- example 0xf + 0xf + 0xf + 2 = 0x2f). If we find c[k] < 0, then we
- know that a < x + y + z, and if we find c[k] > 2, then we know a
- > x + y + z. */
+ M->n += (c0 | c1) != 0;
+ ASSERT (M->n < M->alloc);
+}
- cy = 0;
+/* Update column COL, adding in Q * column (1-COL). Temporary
+ * storage: qn + n <= M->alloc, where n is the size of the largest
+ * element in column 1 - COL. */
+static void
+hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn,
+ unsigned col, mp_ptr tp)
+{
+ ASSERT (col < 2);
- while (an > xn)
+ if (qn == 1)
{
- /* c[k] = (c[k+1], a[k]) */
- if (cy > 0)
- return 1;
+ mp_limb_t q = qp[0];
+ mp_limb_t c0, c1;
- cy = ap[--an];
- }
+ c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q);
+ c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q);
-#if GMP_NAIL_BITS >= 2
- while (an > yn)
- {
- if (cy > 1)
- return 1;
+ M->p[0][col][M->n] = c0;
+ M->p[1][col][M->n] = c1;
- cy = (cy << GMP_NUMB_BITS) + ap[--an];
- if (cy < xp[an])
- return -1;
- cy -= xp[an];
+ M->n += (c0 | c1) != 0;
}
- while (an > zn)
+ else
{
- mp_limb_t s;
+ unsigned row;
- if (cy > 2)
- return 1;
+ /* Carries for the unlikely case that we get both high words
+ from the multiplication and carries from the addition. */
+ mp_limb_t c[2];
+ mp_size_t n;
- cy = (cy << GMP_NUMB_BITS ) + ap[--an];
- s = xp[an] + yp[an];
- if (cy < s)
- return -1;
- cy -= s;
- }
- while (an > 0)
- {
- mp_limb_t s;
+ /* The matrix will not necessarily grow in size by qn, so we
+ need normalization in order not to overflow M. */
- if (cy > 2)
- return 1;
-
- cy = (cy << GMP_NUMB_BITS ) + ap[--an];
- s = xp[an] + yp[an] + zp[an];
- if (cy < s)
- return -1;
- cy -= s;
- }
-#else /* GMP_NAIL_BITS < 2 */
-#if GMP_NAIL_BITS == 1
-loselose
-#endif
- while (an > yn)
- {
- /* c[k] = (c[k+1], a[k]) - x[k] */
- if (cy > 1)
- return 1;
+ for (n = M->n; n + qn > M->n; n--)
+ {
+ ASSERT (n > 0);
+ if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0)
+ break;
+ }
+
+ ASSERT (qn + n <= M->alloc);
- --an;
+ for (row = 0; row < 2; row++)
+ {
+ if (qn <= n)
+ mpn_mul (tp, M->p[row][1-col], n, qp, qn);
+ else
+ mpn_mul (tp, qp, qn, M->p[row][1-col], n);
- if (cy == 1)
+ ASSERT (n + qn >= M->n);
+ c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n);
+ }
+ if (c[0] | c[1])
{
- if (ap[an] >= xp[an])
- return 1;
- cy = (ap[an] - xp[an]) & GMP_NUMB_MASK;
+ M->n = n + qn + 1;
+ M->p[0][col][n-1] = c[0];
+ M->p[1][col][n-1] = c[1];
}
else
{
- /* cy == 0 */
- if (ap[an] < xp[an])
- return -1;
- else
- cy = ap[an] - xp[an];
+ n += qn;
+ n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0;
+ if (n > M->n)
+ M->n = n;
}
}
- while (an > zn)
- {
- mp_limb_t sh, sl;
-
- /* c[k] = (c[k+1], a[k]) - x[k] - y[k] */
- if (cy > 2)
- return 1;
-
- --an;
-
- sl = xp[an] + yp[an];
- sh = (sl < xp[an]);
-
- if (cy < sh || (cy == sh && ap[an] < sl))
- return -1;
-
- sl = ap[an] - sl; /* Monkey business */
- sh = cy - sh - (sl > ap[an]);
- if (sh > 0)
- return 1;
- cy = sl;
- }
- while (an > 0)
- {
- mp_limb_t sh, sl;
- if (cy > 2)
- return 1;
-
- --an;
-
- sl = xp[an] + yp[an];
- sh = (sl < xp[an]);
-
- sl += zp[an];
- sh += sl < zp[an];
-
- if (cy < sh || (cy == sh && ap[an] < sl))
- return -1;
- sl = ap[an] - sl; /* Monkey business */
- sh = cy - sh - (sl > ap[an]);
- if (sh > 0)
- return 1;
- cy = sl;
- }
-#endif /* GMP_NAIL_BITS < 2 */
- return cy > 0;
-}
-
-/* Only the first row has v = 0, a = 1 * a + 0 * b */
-static inline int
-hgcd_start_row_p (const struct hgcd_row *r, mp_size_t n)
-{
- mp_size_t i;
- mp_srcptr vp = r->uvp[1];
-
- for (i = 0; i < n; i++)
- if (vp[i] != 0)
- return 0;
-
- return 1;
+ ASSERT (M->n < M->alloc);
}
-/* Called when r[0, 1, 2] >= W^M, r[3] < W^M. Returns the number of
- remainders that satisfy Jebelean's criterion, i.e. find the largest k
- such that
-
- r[k+1] >= max (-u[k+1], - v[k+1])
-
- r[k] - r[k-1] >= max (u[k+1] - u[k], v[k+1] - v[k])
-
- Return 0 on failure, i.e. if B or A mod B < W^M. Return 1 in case
- r0 and r1 are correct, but we still make no progress because r0 =
- A, r1 = B.
-
- Otherwise return 2, 3 or 4, the number of r:s that are correct.
- */
-static int
-hgcd_jebelean (const struct hgcd *hgcd, mp_size_t M)
+/* Multiply M by M1 from the right. Since the M1 elements fit in
+ GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs
+ temporary space M->n */
+static void
+hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1,
+ mp_ptr tp)
{
- mp_size_t L;
- unsigned bit;
-
- ASSERT (hgcd->row[0].rsize > M);
- ASSERT (hgcd->row[1].rsize > M);
- ASSERT (hgcd->row[2].rsize > M);
- ASSERT (hgcd->row[3].rsize <= M);
-
- ASSERT (MPN_LESS_P (hgcd->row[1].rp, hgcd->row[1].rsize,
- hgcd->row[0].rp, hgcd->row[0].rsize));
- ASSERT (MPN_LESS_P (hgcd->row[2].rp, hgcd->row[2].rsize,
- hgcd->row[1].rp, hgcd->row[1].rsize));
- ASSERT (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize,
- hgcd->row[2].rp, hgcd->row[2].rsize));
-
- ASSERT (mpn_cmp (hgcd->row[0].uvp[1], hgcd->row[1].uvp[1], hgcd->size) <= 0);
- ASSERT (mpn_cmp (hgcd->row[1].uvp[1], hgcd->row[2].uvp[1], hgcd->size) <= 0);
- ASSERT (mpn_cmp (hgcd->row[2].uvp[1], hgcd->row[3].uvp[1], hgcd->size) <= 0);
-
- /* The bound is really floor (N/2), which is <= M = ceil (N/2) */
- L = hgcd->size;
- ASSERT (L <= M);
-
- ASSERT (L > 0);
- ASSERT (hgcd->row[3].uvp[1][L - 1] != 0);
-
- bit = hgcd->sign < 0;
-
- /* Check r1 - r2 >= max (u2 - u1, v2 - v1) = {|u1| + |u2|, |v1| + |v2|}[bit] */
-
- if (mpn_cmp_sum3 (hgcd->row[1].rp, hgcd->row[1].rsize,
- hgcd->row[2].rp, hgcd->row[2].rsize,
- hgcd->row[1].uvp[bit], L,
- hgcd->row[2].uvp[bit], L) < 0)
- return 2 - (hgcd_start_row_p (hgcd->row, hgcd->size));
-
- /* Ok, r2 is correct */
-
- /* Check r3 >= max (-u3, -v3) = (|u3|, |v3|)[bit] */
- if (hgcd->row[3].rsize > L)
- /* Condition satisfied */
- ;
- else
+ unsigned row;
+ mp_limb_t grow;
+ for (row = 0, grow = 0; row < 2; row++)
{
- mp_size_t size;
- for (size = L; size > hgcd->row[3].rsize; size--)
- {
- if (hgcd->row[3].uvp[bit][size-1] != 0)
- return 3;
- }
- if (mpn_cmp (hgcd->row[3].rp, hgcd->row[3].uvp[bit], size) < 0)
- return 3;
- }
+ mp_limb_t c0, c1;
- /* Check r3 - r2 >= max(u3-u2, v3-v2) = {|u2| + |u3|, |v2| +|v3|}[1-bit] */
+ /* Compute (u, u') <-- (r00 u + r10 u', r01 u + r11 u') as
- if (mpn_cmp_sum3 (hgcd->row[2].rp, hgcd->row[2].rsize,
- hgcd->row[3].rp, hgcd->row[3].rsize,
- hgcd->row[2].uvp[bit ^ 1], L,
- hgcd->row[3].uvp[bit ^ 1], L) < 0)
- return 3;
-
- /* Ok, r3 is correct */
- return 4;
-}
+ t = u
+ u *= r00
+ u += r10 * u'
+ u' *= r11
+ u' += r01 * t
+ */
+ /* FIXME: Duplication with mpn_hgcd_mul_matrix1_vector. */
+ MPN_COPY (tp, M->p[row][0], M->n);
+ c0 = mpn_mul_1 (M->p[row][0], M->p[row][0], M->n, M1->u[0][0]);
+ c0 += mpn_addmul_1 (M->p[row][0], M->p[row][1], M->n, M1->u[1][0]);
+ M->p[row][0][M->n] = c0;
-/* Compute au + bv. u and v are single limbs, a and b are n limbs each.
- Stores n+1 limbs in rp, and returns the (n+2)'nd limb. */
-/* FIXME: With nails, we can instead return limb n+1, possibly including
- one non-zero nail bit. */
-static mp_limb_t
-mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n,
- mp_srcptr ap, mp_limb_t u,
- mp_srcptr bp, mp_limb_t v)
-{
- mp_limb_t h;
- mp_limb_t cy;
+ c1 = mpn_mul_1 (M->p[row][1], M->p[row][1], M->n, M1->u[1][1]);
+ c1 += mpn_addmul_1 (M->p[row][1], tp, M->n, M1->u[0][1]);
+ M->p[row][1][M->n] = c1;
- h = mpn_mul_1 (rp, ap, n, u);
- cy = mpn_addmul_1 (rp, bp, n, v);
- h += cy;
-#if GMP_NAIL_BITS == 0
- rp[n] = h;
- return (h < cy);
-#else /* GMP_NAIL_BITS > 0 */
- rp[n] = h & GMP_NUMB_MASK;
- return h >> GMP_NUMB_BITS;
-#endif /* GMP_NAIL_BITS > 0 */
-}
-
-
-static inline void
-qstack_drop (struct qstack *stack)
-{
- ASSERT (stack->size_next);
- stack->limb_next -= stack->size[--stack->size_next];
-}
-
-/* Get top element */
-static inline mp_size_t
-qstack_get_0 (const struct qstack *stack,
- mp_srcptr *qp)
-{
- mp_size_t qsize;
- ASSERT (stack->size_next);
-
- qsize = stack->size[stack->size_next - 1];
- *qp = stack->limb + stack->limb_next - qsize;
-
- return qsize;
+ grow |= (c0 | c1);
+ }
+ M->n += (grow != 0);
+ ASSERT (M->n < M->alloc);
}
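The in-place update order used in hgcd_matrix_mul_1 (save u, scale both entries, add the cross terms) can be checked on ordinary integers. A minimal standalone sketch in plain C, not GMP code; the values are arbitrary:

/* Verify that the in-place order (t = u; u *= r00; u += r10*u';
   u' *= r11; u' += r01*t) matches the direct row-times-matrix
   formulas (u, u') <- (r00 u + r10 u', r01 u + r11 u'). */
#include <assert.h>

int
main (void)
{
  unsigned long u = 7, up = 5;                       /* one row of M */
  unsigned long r00 = 2, r01 = 1, r10 = 1, r11 = 1;  /* M1 entries */
  unsigned long new_u, new_up, t;

  new_u  = r00 * u + r10 * up;   /* direct formulas */
  new_up = r01 * u + r11 * up;

  t = u;                         /* in-place order from the code above */
  u *= r00;
  u += r10 * up;
  up *= r11;
  up += r01 * t;

  assert (u == new_u && up == new_up);
  return 0;
}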
-/* Get element just below the top */
-static inline mp_size_t
-qstack_get_1 (const struct qstack *stack,
- mp_srcptr *qp)
-{
- mp_size_t qsize;
- ASSERT (stack->size_next >= 2);
+/* Perform a few steps, using some of mpn_hgcd2, subtraction and
+ division. Reduces the size by almost one limb or more, but never
+ below the given size s. Return new size for a and b, or 0 if no
+ more steps are possible.
- qsize = stack->size[stack->size_next - 2];
- *qp = stack->limb + stack->limb_next
- - stack->size[stack->size_next - 1]
- - qsize;
+ If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n
+ limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
+ fails, needs space for the quotient, qn <= n - s + 1 limbs, and for
+ hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
+ resulting size of M.
- return qsize;
-}
+ If N is the input size to the calling hgcd, then s = floor(N/2) +
+ 1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
+ < N, so N is sufficient.
+*/
-/* Adds d to the element on top of the stack */
-static void
-qstack_adjust (struct qstack *stack, mp_limb_t d)
+static mp_size_t
+hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
+ struct hgcd_matrix *M, mp_ptr tp)
{
- mp_size_t qsize;
+ struct hgcd_matrix1 M1;
+ mp_limb_t mask;
+ mp_limb_t ah, al, bh, bl;
+ mp_size_t an, bn, qn;
mp_ptr qp;
+ int col;
- ASSERT (stack->size_next);
+ ASSERT (n > s);
- ASSERT_QSTACK (stack);
+ mask = ap[n-1] | bp[n-1];
+ ASSERT (mask > 0);
- if (stack->limb_next >= stack->limb_alloc)
+ if (n == s + 1)
{
- qstack_rotate (stack, 1);
- }
-
- ASSERT (stack->limb_next < stack->limb_alloc);
+ if (mask < 4)
+ goto subtract;
- qsize = stack->size[stack->size_next - 1];
- qp = stack->limb + stack->limb_next - qsize;
-
- if (qsize == 0)
- {
- qp[0] = 1 + d;
- stack->size[stack->size_next - 1] = 1;
- stack->limb_next++;
+ ah = ap[n-1]; al = ap[n-2];
+ bh = bp[n-1]; bl = bp[n-2];
}
- else
+ else if (mask & GMP_NUMB_HIGHBIT)
{
- mp_limb_t cy = mpn_add_1 (qp, qp, qsize, d);
- if (cy)
- {
- qp[qsize] = cy;
- stack->size[stack->size_next - 1]++;
- stack->limb_next++;
- }
+ ah = ap[n-1]; al = ap[n-2];
+ bh = bp[n-1]; bl = bp[n-2];
}
-
- ASSERT_QSTACK (stack);
-}
-
-/* hgcd2 operations */
-
-/* Computes P = R * S. No overlap allowed. */
-static mp_size_t
-hgcd2_mul (struct hgcd_row *P, mp_size_t alloc,
- const struct hgcd2_row *R,
- const struct hgcd_row *S, mp_size_t n)
-{
- int grow = 0;
- mp_limb_t h = 0;
- unsigned i;
- unsigned j;
-
- ASSERT (n < alloc);
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- {
- /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j]
- = u_i s0j + v_i s1j */
- mp_limb_t cy;
-
- cy = mpn_addmul2_n_1 (P[i].uvp[j], n,
- S[0].uvp[j], R[i].u,
- S[1].uvp[j], R[i].v);
- if (cy)
- {
- ASSERT (n + 2 <= alloc);
- P[i].uvp[j][n+1] = cy;
- grow = 1;
- }
- else
- h |= P[i].uvp[j][n];
- }
- if (grow)
- return n + 2;
else
- /* Don't add redundant zeroes */
- return n + (h != 0);
-}
-
-unsigned
-mpn_hgcd_max_recursion (mp_size_t n)
-{
- int count;
-
- count_leading_zeros (count, (mp_limb_t)
- (1 + n / (HGCD_SCHOENHAGE_THRESHOLD - 5)));
-
- return GMP_LIMB_BITS - count;
-}
-
-mp_size_t
-mpn_hgcd_init_itch (mp_size_t size)
-{
- /* r0 <= a, r1, r2, r3 <= b, but for simplicity, we allocate asize +
- 1 for all of them. The size of the uv:s are limited to asize / 2,
- but we allocate one extra limb. */
-
- return 4 * (size + 1) + 8 * ((size / 2) + 1);
-}
-
-void
-mpn_hgcd_init (struct hgcd *hgcd,
- mp_size_t asize,
- mp_limb_t *limbs)
-{
- unsigned i;
- unsigned j;
- mp_size_t alloc = (asize / 2) + 1;
-
- hgcd->sign = 0;
-
- for (i = 0; i < 4; i++)
- {
- hgcd->row[i].rp = limbs;
- hgcd->row[i].rsize = asize + 1; limbs += asize + 1;
- }
-
- hgcd->alloc = alloc;
- hgcd->size = alloc;
-
- for (i = 0; i < 4; i++)
- for (j = 0; j < 2; j++)
- {
- hgcd->row[i].uvp[j] = limbs;
- limbs += alloc;
- }
-}
-
-#if WANT_ASSERT
-void
-__gmpn_hgcd_sanity (const struct hgcd *hgcd,
- mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize,
- unsigned start, unsigned end)
-{
- int sign;
- unsigned i;
- mp_size_t L = hgcd->size;
- mp_ptr tp;
- mp_size_t talloc;
- mp_ptr t1p;
- mp_ptr t2p;
- const struct hgcd_row *r;
-
- ASSERT (asize >= bsize);
-
- ASSERT (L <= asize / 2);
- ASSERT (L);
-
- ASSERT (L <= asize);
- ASSERT (L <= bsize);
-
- /* NOTE: We really need only asize + bsize + 2*L, but since we're
- * swapping the pointers around, we allocate 2*(asize + L). */
- talloc = 2*(asize + L);
- tp = __GMP_ALLOCATE_FUNC_LIMBS (talloc);
- t1p = tp;
- t2p = t1p + (asize + L);
-
- sign = hgcd->sign;
- if (start % 2)
- sign = ~sign;
- for (i = start, r = &hgcd->row[start]; i < end; i++, sign = ~sign, r++)
{
- mp_size_t t1size = asize + L;
- mp_size_t t2size = bsize + L;
-
- mp_size_t k;
- for (k = hgcd->size; k < hgcd->alloc; k++)
- {
- ASSERT (r->uvp[0][k] == 0);
- ASSERT (r->uvp[1][k] == 0);
- }
-
- mpn_mul (t1p, ap, asize, r->uvp[0], L);
- mpn_mul (t2p, bp, bsize, r->uvp[1], L);
-
- if (sign < 0)
- MPN_PTR_SWAP (t1p, t1size, t2p, t2size);
+ int shift;
- MPN_NORMALIZE (t2p, t2size);
- ASSERT (t2size <= t1size);
- ASSERT_NOCARRY (mpn_sub (t1p, t1p, t1size, t2p, t2size));
-
- MPN_NORMALIZE (t1p, t1size);
- ASSERT (MPN_EQUAL_P (t1p, t1size, r->rp, r->rsize));
- }
- __GMP_FREE_FUNC_LIMBS (tp, talloc);
- for (i = start; i < end - 1; i++)
- {
- /* We should have strict inequality after each reduction step,
- but we allow equal values for input. */
- ASSERT (MPN_LEQ_P (hgcd->row[i+1].rp, hgcd->row[i+1].rsize,
- hgcd->row[i].rp, hgcd->row[i].rsize));
+ count_leading_zeros (shift, mask);
+ ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+ al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+ bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+ bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
}
-}
-#endif /* WANT_ASSERT */
-
-/* Helper functions for hgcd */
-/* Sets (a, b, c, d) <-- (b, c, d, a) */
-#define HGCD_SWAP4_LEFT(row) \
-do { \
- struct hgcd_row __hgcd_swap4_left_tmp; \
- __hgcd_swap4_left_tmp = row[0]; \
- row[0] = row[1]; \
- row[1] = row[2]; \
- row[2] = row[3]; \
- row[3] = __hgcd_swap4_left_tmp; \
-} while (0)
-
-/* Sets (a, b, c, d) <-- (d, a, b, c) */
-#define HGCD_SWAP4_RIGHT(row) \
-do { \
- struct hgcd_row __hgcd_swap4_right_tmp; \
- __hgcd_swap4_right_tmp = row[3]; \
- row[3] = row[2]; \
- row[2] = row[1]; \
- row[1] = row[0]; \
- row[0] = __hgcd_swap4_right_tmp; \
-} while (0)
-
-/* Sets (a, b, c, d) <-- (c, d, a, b) */
-#define HGCD_SWAP4_2(row) \
-do { \
- struct hgcd_row __hgcd_swap4_2_tmp; \
- __hgcd_swap4_2_tmp = row[0]; \
- row[0] = row[2]; \
- row[2] = __hgcd_swap4_2_tmp; \
- __hgcd_swap4_2_tmp = row[1]; \
- row[1] = row[3]; \
- row[3] = __hgcd_swap4_2_tmp; \
-} while (0)
-
-/* Sets (a, b, c) <-- (b, c, a) */
-#define HGCD_SWAP3_LEFT(row) \
-do { \
- struct hgcd_row __hgcd_swap4_left_tmp; \
- __hgcd_swap4_left_tmp = row[0]; \
- row[0] = row[1]; \
- row[1] = row[2]; \
- row[2] = __hgcd_swap4_left_tmp; \
-} while (0)
-
-/* Computes P = R * S. No overlap allowed.
-
- Temporary space is needed for two numbers smaller than the
- resulting matrix elements, i.e. bounded by 2*L <= N. */
-static mp_size_t
-hgcd_mul (struct hgcd_row *P, mp_size_t alloc,
- const struct hgcd_row *R, mp_size_t rsize,
- const struct hgcd_row *S, mp_size_t ssize,
- mp_ptr tp, mp_size_t talloc)
-{
- unsigned i;
- unsigned j;
-
- mp_size_t psize;
- mp_limb_t h = 0;
- int grow = 0;
- MPN_NORMALIZE (R[1].uvp[1], rsize);
- ASSERT (S[1].uvp[1][ssize - 1] != 0);
-
- psize = rsize + ssize;
- ASSERT (psize <= talloc);
-
- if (rsize >= ssize)
- {
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- {
- /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */
- mp_limb_t cy;
-
- mpn_mul (P[i].uvp[j], R[i].uvp[0], rsize, S[0].uvp[j], ssize);
- mpn_mul (tp, R[i].uvp[1], rsize, S[1].uvp[j], ssize);
-
- cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize);
-
- if (cy)
- {
- ASSERT (psize + 1 < alloc);
- P[i].uvp[j][psize] = cy;
- grow = 1;
- }
- else
- h |= P[i].uvp[j][psize - 1];
- }
- }
- else
+ /* Try an mpn_hgcd2 step */
+ if (mpn_hgcd2 (ah, al, bh, bl, &M1))
{
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- {
- /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */
- mp_limb_t cy;
-
- mpn_mul (P[i].uvp[j], S[0].uvp[j], ssize, R[i].uvp[0], rsize);
- mpn_mul (tp, S[1].uvp[j], ssize, R[i].uvp[1], rsize);
-
- cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize);
-
- if (cy)
- {
- ASSERT (psize + 1 < alloc);
- P[i].uvp[j][psize] = cy;
- grow = 1;
- }
- else
- h |= P[i].uvp[j][psize - 1];
- }
- }
-
- if (grow)
- return psize + 1;
- else
- return psize - (h == 0);
-}
-
-/* Computes R = W^k s->r + s->u A' - s->v B', which must be
- non-negative. W denotes 2^(GMP_NUMB_BITS). Temporary space needed
- is k + uvsize <= M + L = N.
-
- Must have v > 0, v >= u. */
-
-mp_size_t
-mpn_hgcd_fix (mp_size_t k,
- mp_ptr rp, mp_size_t ralloc,
- int sign, mp_size_t uvsize,
- const struct hgcd_row *s,
- mp_srcptr ap,
- mp_srcptr bp,
- mp_ptr tp, mp_size_t talloc)
-{
- mp_size_t tsize;
- mp_limb_t cy;
- mp_size_t rsize;
- mp_srcptr up;
- mp_srcptr vp;
+ /* Multiply M <- M * M1 */
+ hgcd_matrix_mul_1 (M, &M1, tp);
- up = s->uvp[0]; vp = s->uvp[1];
- MPN_NORMALIZE (vp, uvsize);
- ASSERT (uvsize > 0);
-
- if (sign < 0)
- {
- MP_SRCPTR_SWAP (up, vp);
- MP_SRCPTR_SWAP (ap, bp);
+ /* Multiply M1^{-1} (a;b) */
+ return mpn_hgcd_mul_matrix1_inverse_vector (&M1, n, ap, bp, tp);
}
- tsize = k + uvsize;
+ subtract:
+ /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and
+ bh was too small, or ah, bh were (almost) equal. Perform one
+ subtraction step (for possible cancellation of high limbs),
+ followed by one division. */
- ASSERT (k + s->rsize <= ralloc);
- ASSERT (tsize <= talloc);
- ASSERT (tsize <= ralloc);
+ /* Since we must ensure that #(a-b) > s, we handle cancellation of
+ high limbs explicitly up front. (FIXME: Or is it better to just
+ subtract, normalize, and use an addition to undo if it turns out
+ the difference is too small?) */
+ for (an = n; an > s; an--)
+ if (ap[an-1] != bp[an-1])
+ break;
- ASSERT (rp != s->rp);
-
- /* r = W^k s + u a */
- if (uvsize <= k)
- mpn_mul (rp, ap, k, up, uvsize);
- else
- mpn_mul (rp, up, uvsize, ap, k);
+ if (an == s)
+ return 0;
- if (uvsize <= s->rsize)
+ /* Maintain a > b. When needed, swap a and b, and let col keep track
+ of how to update M. */
+ if (ap[an-1] > bp[an-1])
{
- cy = mpn_add (rp + k, s->rp, s->rsize, rp + k, uvsize);
- rsize = k + s->rsize;
+ /* a is largest. In the subtraction step, we need to update
+ column 1 of M */
+ col = 1;
}
else
{
- cy = mpn_add (rp + k, rp + k, uvsize, s->rp, s->rsize);
- rsize = k + uvsize;
- }
-
- if (cy)
- {
- ASSERT (rsize < ralloc);
- rp[rsize++] = cy;
+ MP_PTR_SWAP (ap, bp);
+ col = 0;
}
- /* r -= v b */
-
- if (uvsize <= k)
- mpn_mul (tp, bp, k, vp, uvsize);
- else
- mpn_mul (tp, vp, uvsize, bp, k);
-
- ASSERT_NOCARRY (mpn_sub (rp, rp, rsize, tp, tsize));
- MPN_NORMALIZE (rp, rsize);
-
- return rsize;
-}
+ bn = n;
+ MPN_NORMALIZE (bp, bn);
+ if (bn <= s)
+ return 0;
+
+ /* We have #a, #b > s. When is it possible that #(a-b) < s? For
+ cancellation to happen, the numbers must be of the form
-/* Compute r2 = r0 - q r1 */
-static void
-hgcd_update_r (struct hgcd_row *r, mp_srcptr qp, mp_size_t qsize)
-{
- mp_srcptr r0p = r[0].rp;
- mp_srcptr r1p = r[1].rp;
- mp_ptr r2p = r[2].rp;
- mp_size_t r0size = r[0].rsize;
- mp_size_t r1size = r[1].rsize;
+ a = x + 1, 0, ..., 0, al
+ b = x , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl
- ASSERT (MPN_LESS_P (r1p, r1size, r0p, r0size));
+ where al, bl denote the least significant k limbs. If al < bl,
+ then #(a-b) <= k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX,
+ then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */
- if (qsize == 0)
- {
- ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r1p, r1size));
- }
- else if (qsize == 1)
+ if (ap[an-1] == bp[an-1] + 1)
{
- mp_size_t size;
- mp_limb_t cy = mpn_mul_1 (r2p, r1p, r1size, qp[0]);
- size = r1size;
+ mp_size_t k;
+ int c;
+ for (k = an-1; k > s; k--)
+ if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX)
+ break;
- if (cy)
+ MPN_CMP (c, ap, bp, k);
+ if (c < 0)
{
- ASSERT (size < r0size);
- r2p[size++] = cy;
+ mp_limb_t cy;
+
+ /* The limbs from k and up are cancelled. */
+ if (k == s)
+ return 0;
+ cy = mpn_sub_n (ap, ap, bp, k);
+ ASSERT (cy == 1);
+ an = k;
}
-
- ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size));
- }
- else
- {
- mp_size_t size = r1size + qsize;
- ASSERT (size <= r0size + 1);
-
- if (qsize <= r1size)
- mpn_mul (r2p, r1p, r1size, qp, qsize);
else
- mpn_mul (r2p, qp, qsize, r1p, r1size);
-
- if (size > r0size)
{
- ASSERT (size == r0size + 1);
- size--;
- ASSERT (r2p[size] == 0);
+ ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k));
+ ap[k] = 1;
+ an = k + 1;
}
-
- ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size));
}
+ else
+ ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an));
+
+ ASSERT (an > s);
+ ASSERT (ap[an-1] > 0);
+ ASSERT (bn > s);
+ ASSERT (bp[bn-1] > 0);
+
+ hgcd_matrix_update_1 (M, col);
- MPN_NORMALIZE (r[2].rp, r0size);
- r[2].rsize = r0size;
-
- ASSERT (MPN_LESS_P (r2p, r0size, r1p, r1size));
-}
-
-/* Compute (u2, v2) = (u0, v0) + q (u1, v1)
- Return the size of the largest u,v element.
- Caller must ensure that usize + qsize <= available storage */
-static mp_size_t
-hgcd_update_uv (struct hgcd_row *r, mp_size_t usize,
- mp_srcptr qp, mp_size_t qsize)
-{
- unsigned i;
- mp_size_t grow;
-
- ASSERT (r[1].uvp[1][usize - 1] != 0);
-
- /* Compute u2 = u0 + q u1 */
-
- if (qsize == 0)
+ if (an < bn)
{
- /* Represents a unit quotient */
- mp_limb_t cy;
-
- cy = mpn_add_n (r[2].uvp[0], r[0].uvp[0], r[1].uvp[0], usize);
- r[2].uvp[0][usize] = cy;
-
- cy = mpn_add_n (r[2].uvp[1], r[0].uvp[1], r[1].uvp[1], usize);
- r[2].uvp[1][usize] = cy;
- grow = cy;
+ MPN_PTR_SWAP (ap, an, bp, bn);
+ col ^= 1;
}
- else if (qsize == 1)
+ else if (an == bn)
{
- mp_limb_t q = qp[0];
- for (i = 0; i < 2; i++)
+ int c;
+ MPN_CMP (c, ap, bp, an);
+ if (c < 0)
{
- mp_srcptr u0p = r[0].uvp[i];
- mp_srcptr u1p = r[1].uvp[i];
- mp_ptr u2p = r[2].uvp[i];
- mp_limb_t cy;
-
- /* Too bad we don't have an addmul_1 with distinct source and
- destination */
- cy = mpn_mul_1 (u2p, u1p, usize, q);
- cy += mpn_add_n (u2p, u2p, u0p, usize);
-
- u2p[usize] = cy;
- grow = cy != 0;
+ MP_PTR_SWAP (ap, bp);
+ col ^= 1;
}
}
- else
- {
- for (i = 0; i < 2; i++)
- {
- mp_srcptr u0p = r[0].uvp[i];
- mp_srcptr u1p = r[1].uvp[i];
- mp_ptr u2p = r[2].uvp[i];
-
- if (qsize <= usize)
- mpn_mul (u2p, u1p, usize, qp, qsize);
- else
- mpn_mul (u2p, qp, qsize, u1p, usize);
- ASSERT_NOCARRY (mpn_add (u2p, u2p, usize + qsize, u0p, usize));
- grow = qsize - ((u2p[usize + qsize - 1]) == 0);
- }
- }
+ /* Divide a / b. */
+ qn = an + 1 - bn;
- usize += grow;
+ /* FIXME: We could use an approximate division that may return a
+ quotient which is too small, and only guarantees that the size of r
+ is almost the size of b. FIXME: Let ap and remainder overlap. */
+ mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
+ qn -= (tp[qn - 1] == 0);
- /* The values should be allocated with one limb margin */
- ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0);
- ASSERT (mpn_cmp (r[1].uvp[1], r[2].uvp[1], usize) <= 0);
- ASSERT (r[2].uvp[1][usize - 1] != 0);
+ /* Normalize remainder */
+ an = bn;
+ for ( ; an > s; an--)
+ if (ap[an-1] > 0)
+ break;
- return usize;
-}
-
-/* Compute r0 = r2 + q r1, and the corresponding uv */
-static void
-hgcd_backup (struct hgcd_row *r, mp_size_t usize,
- mp_srcptr qp, mp_size_t qsize)
-{
- mp_ptr r0p = r[0].rp;
- mp_srcptr r1p = r[1].rp;
- mp_srcptr r2p = r[2].rp;
- mp_size_t r0size;
- mp_size_t r1size = r[1].rsize;
- mp_size_t r2size = r[2].rsize;
-
- mp_ptr u0p = r[0].uvp[0];
- mp_ptr v0p = r[0].uvp[1];
- mp_srcptr u1p = r[1].uvp[0];
- mp_srcptr v1p = r[1].uvp[1];
- mp_srcptr u2p = r[2].uvp[0];
- mp_srcptr v2p = r[2].uvp[1];
-
- ASSERT (MPN_LESS_P (r2p, r2size, r1p, r1size));
-
- if (qsize == 0)
- {
- /* r0 = r2 + r1 */
- mp_limb_t cy = mpn_add (r0p, r1p, r1size, r2p, r2size);
- r0size = r1size;
- if (cy)
- r0p[r0size++] = cy;
-
- /* (u0,v0) = (u2,v2) - (u1, v1) */
-
- ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u1p, usize));
- ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v1p, usize));
- }
- else if (qsize == 1)
+ if (an <= s)
{
- /* r0 = r2 + q r1
-
- Just like for mpn_addmul_1, the result is the same size as r1, or
- one limb larger. */
-
+ /* Quotient is too large */
mp_limb_t cy;
- cy = mpn_mul_1 (r0p, r1p, r1size, qp[0]);
- cy += mpn_add (r0p, r0p, r1size, r2p, r2size);
-
- r0size = r1size;
- if (cy)
- r0p[r0size++] = cy;
-
- /* (u0,v0) = (u2,v2) - q (u1, v1) */
-
- ASSERT_NOCARRY (mpn_mul_1 (u0p, u1p, usize, qp[0]));
- ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize));
-
- ASSERT_NOCARRY (mpn_mul_1 (v0p, v1p, usize, qp[0]));
- ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize));
- }
- else
- {
- /* r0 = r2 + q r1
-
- Result must be of size r1size + q1size - 1, or one limb
- larger. */
-
- mp_size_t size;
-
- r0size = r1size + qsize;
- if (r1size >= qsize)
- mpn_mul (r0p, r1p, r1size, qp, qsize);
- else
- mpn_mul (r0p, qp, qsize, r1p, r1size);
-
- ASSERT_NOCARRY (mpn_add (r0p, r0p, r0size, r2p, r2size));
+ cy = mpn_add (ap, bp, bn, ap, an);
- r0size -= (r0p[r0size-1] == 0);
-
- /* (u0,v0) = (u2,v2) - q (u1, v1) */
-
- /* We must have
-
- usize >= #(q u1) >= qsize + #u1 - 1
-
- which means that u1 must have at least
-
- usize - #u1 >= qsize - 1
-
- zero limbs at the high end, and similarly for v1. */
-
- ASSERT (qsize <= usize);
- size = usize - qsize + 1;
-#if WANT_ASSERT
- {
- mp_size_t i;
- for (i = size; i < usize; i++)
- {
- ASSERT (u1p[i] == 0);
- ASSERT (v1p[i] == 0);
- }
- }
-#endif
- /* NOTE: Needs an extra limb for the u,v values */
-
- if (qsize <= size)
- {
- mpn_mul (u0p, u1p, size, qp, qsize);
- mpn_mul (v0p, v1p, size, qp, qsize);
- }
- else
+ if (cy > 0)
{
- mpn_mul (u0p, qp, qsize, u1p, size);
- mpn_mul (v0p, qp, qsize, v1p, size);
+ ASSERT (bn < n);
+ ap[bn] = cy;
+ bp[bn] = 0;
+ bn++;
}
- /* qsize + size = usize + 1 */
- ASSERT (u0p[usize] == 0);
- ASSERT (v0p[usize] == 0);
-
- ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize));
- ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize));
+ MPN_DECR_U (tp, qn, 1);
+ qn -= (tp[qn-1] == 0);
}
- r[0].rsize = r0size;
-}
-
-/* Called after HGCD_SWAP4_RIGHT, to adjust the size field. Large
- numbers in row 0 don't count, and are overwritten. */
-static void
-hgcd_normalize (struct hgcd *hgcd)
-{
- mp_size_t size = hgcd->size;
-
- /* v3 should always be the largest element */
- while (size > 0 && hgcd->row[3].uvp[1][size - 1] == 0)
- {
- size--;
- /* Row 0 is about to be overwritten. We must zero out unused limbs */
- hgcd->row[0].uvp[0][size] = 0;
- hgcd->row[0].uvp[1][size] = 0;
-
- ASSERT (hgcd->row[1].uvp[0][size] == 0);
- ASSERT (hgcd->row[1].uvp[1][size] == 0);
- ASSERT (hgcd->row[2].uvp[0][size] == 0);
- ASSERT (hgcd->row[2].uvp[1][size] == 0);
- ASSERT (hgcd->row[3].uvp[0][size] == 0);
- }
+ if (qn > 0)
+ hgcd_matrix_update_q (M, tp, qn, col, tp + qn);
- hgcd->size = size;
+ return bn;
}
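When neither top limb has its high bit set, hgcd_step normalizes by extracting two full limbs starting at the most significant nonzero bit. A standalone, nail-free 64-bit sketch of that extraction follows; the helper is an assumed analogue of MPN_EXTRACT_NUMB, not the GMP macro itself, and the inputs are arbitrary:

/* Take 'shift' leading zeros into account and return 64 bits made of
   the low (64 - shift) bits of xh followed by the high 'shift' bits
   of xl. */
#include <assert.h>
#include <stdint.h>

static uint64_t
extract (unsigned shift, uint64_t xh, uint64_t xl)
{
  return (xh << shift) | (shift ? xl >> (64 - shift) : 0);
}

int
main (void)
{
  uint64_t hi = UINT64_C (0x00000000000000ff);  /* 56 leading zeros */
  uint64_t lo = UINT64_C (0x1122334455667788);

  assert (extract (56, hi, lo) == UINT64_C (0xff11223344556677));
  return 0;
}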
-int
-mpn_hgcd2_lehmer_step (struct hgcd2 *hgcd,
- mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize,
- struct qstack *quotients)
+/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
+ with elements of size at most (n+1)/2 - 1. Returns new size of a,
+ b, or zero if no reduction is possible. */
+mp_size_t
+mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n,
+ struct hgcd_matrix *M, mp_ptr tp)
{
- mp_limb_t ah;
- mp_limb_t al;
- mp_limb_t bh;
- mp_limb_t bl;
+ mp_size_t s = n/2 + 1;
+ mp_size_t nn;
- ASSERT (asize >= bsize);
- ASSERT (MPN_LEQ_P (bp, bsize, ap, asize));
+ ASSERT (n > s);
+ ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
- if (bsize < 2)
+ nn = hgcd_step (n, ap, bp, s, M, tp);
+ if (!nn)
return 0;
-#if 0 && WANT_TRACE
- trace ("lehmer_step:\n"
- " a = %Nd\n"
- " b = %Nd\n",
- ap, asize, bp, bsize);
-#endif
-#if WANT_TRACE
- trace ("lehmer_step: asize = %d, bsize = %d\n", asize, bsize);
-#endif
-
- /* The case asize == 2 is needed to take care of values that are
- between one and two *full* limbs in size. */
- if (asize == 2 || (ap[asize-1] & GMP_NUMB_HIGHBIT))
- {
- if (bsize < asize)
- return 0;
-
- al = ap[asize - 2];
- ah = ap[asize - 1];
-
- ASSERT (asize == bsize);
- bl = bp[asize - 2];
- bh = bp[asize - 1];
- }
- else
- {
- unsigned shift;
- if (bsize + 1 < asize)
- return 0;
-
- /* We want two *full* limbs */
- ASSERT (asize > 2);
-
- count_leading_zeros (shift, ap[asize-1]);
-#if 0 && WANT_TRACE
- trace ("shift = %d\n", shift);
-#endif
- if (bsize == asize)
- bh = MPN_EXTRACT_LIMB (shift, bp[asize - 1], bp[asize - 2]);
- else
- {
- ASSERT (asize == bsize + 1);
- bh = bp[asize - 2] >> (GMP_LIMB_BITS - shift);
- }
-
- bl = MPN_EXTRACT_LIMB (shift, bp[asize - 2], bp[asize - 3]);
-
- al = MPN_EXTRACT_LIMB (shift, ap[asize - 2], ap[asize - 3]);
- ah = MPN_EXTRACT_LIMB (shift, ap[asize - 1], ap[asize - 2]);
- }
-
-#if WANT_TRACE
- trace ("lehmer_step: ah = %lx, al = %lx, bh = %lx, bl = %lx\n",
- (unsigned long) ah, (unsigned long) al,
- (unsigned long) bh, (unsigned long) bl);
-#endif
- return mpn_hgcd2 (hgcd, ah, al, bh, bl, quotients);
-}
-
-/* Called when r2 has been computed, and it is too small. Top element
- on the stack is r0/r1. One backup step is needed. */
-static int
-hgcd_small_1 (struct hgcd *hgcd, mp_size_t M,
- struct qstack *quotients)
-{
- mp_srcptr qp;
- mp_size_t qsize;
-
- if (hgcd_start_row_p (hgcd->row, hgcd->size))
+ for (;;)
{
- qstack_drop (quotients);
- return 0;
+ n = nn;
+ ASSERT (n > s);
+ nn = hgcd_step (n, ap, bp, s, M, tp);
+ if (!nn)
+ return n;
}
-
- HGCD_SWAP4_RIGHT (hgcd->row);
- hgcd_normalize (hgcd);
-
- qsize = qstack_get_1 (quotients, &qp);
-
- hgcd_backup (hgcd->row, hgcd->size, qp, qsize);
- hgcd->sign = ~hgcd->sign;
-
-#if WANT_ASSERT
- qstack_rotate (quotients, 0);
-#endif
-
- return hgcd_jebelean (hgcd, M);
}
-/* Called when r3 has been computed, and is small enough. Two backup
- steps are needed. */
-static int
-hgcd_small_2 (struct hgcd *hgcd, mp_size_t M,
- const struct qstack *quotients)
+/* Multiply M by M1 from the right. Needs 4*(M->n + M1->n) + 5 limbs
+ of temporary storage (see mpn_matrix22_mul_itch). */
+void
+mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1,
+ mp_ptr tp)
{
- mp_srcptr qp;
- mp_size_t qsize;
+ mp_size_t n;
- if (hgcd_start_row_p (hgcd->row + 2, hgcd->size))
- return 0;
+ /* About the new size of M's elements. Since M1's diagonal elements
+ are > 0, no element can decrease. The new elements are of size
+ M->n + M1->n, one limb more or less. The computation of the
+ matrix product produces elements of size M->n + M1->n + 1. But
+ the true size, after normalization, may be two limbs smaller. */
- qsize = qstack_get_0 (quotients, &qp);
- hgcd_backup (hgcd->row+1, hgcd->size, qp, qsize);
+ /* FIXME: Strassen multiplication gives only a small speedup. In FFT
+ multiplication range, this function could be sped up quite a lot
+ using invariance. */
+ ASSERT (M->n + M1->n < M->alloc);
- if (hgcd_start_row_p (hgcd->row + 1, hgcd->size))
- return 0;
+ ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1]
+ | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0);
- qsize = qstack_get_1 (quotients, &qp);
- hgcd_backup (hgcd->row, hgcd->size, qp, qsize);
+ ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1]
+ | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0);
- return hgcd_jebelean (hgcd, M);
-}
-
-static void
-hgcd_start (struct hgcd *hgcd,
- mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize)
-{
- MPN_COPY (hgcd->row[0].rp, ap, asize);
- hgcd->row[0].rsize = asize;
+ mpn_matrix22_mul (M->p[0][0], M->p[0][1],
+ M->p[1][0], M->p[1][1], M->n,
+ M1->p[0][0], M1->p[0][1],
+ M1->p[1][0], M1->p[1][1], M1->n, tp);
- MPN_COPY (hgcd->row[1].rp, bp, bsize);
- hgcd->row[1].rsize = bsize;
+ n = M->n + M1->n + 1;
+ n -= ((M->p[0][0][n-1] | M->p[0][1][n-1]
+ | M->p[1][0][n-1] | M->p[1][1][n-1]) == 0);
+ n -= ((M->p[0][0][n-1] | M->p[0][1][n-1]
+ | M->p[1][0][n-1] | M->p[1][1][n-1]) == 0);
- hgcd->sign = 0;
- if (hgcd->size != 0)
- {
- /* We must zero out the uv array */
- unsigned i;
- unsigned j;
+ ASSERT ((M->p[0][0][n-1] | M->p[0][1][n-1]
+ | M->p[1][0][n-1] | M->p[1][1][n-1]) > 0);
- for (i = 0; i < 4; i++)
- for (j = 0; j < 2; j++)
- MPN_ZERO (hgcd->row[i].uvp[j], hgcd->size);
- }
-#if WANT_ASSERT
- {
- unsigned i;
- unsigned j;
- mp_size_t k;
-
- for (i = 0; i < 4; i++)
- for (j = 0; j < 2; j++)
- for (k = hgcd->size; k < hgcd->alloc; k++)
- ASSERT (hgcd->row[i].uvp[j][k] == 0);
- }
-#endif
-
- hgcd->size = 1;
- hgcd->row[0].uvp[0][0] = 1;
- hgcd->row[1].uvp[1][0] = 1;
+ M->n = n;
}
-/* Performs one euclid step on r0, r1. Returns >= 0 if hgcd should be
- terminated, -1 if we should go on */
-static int
-euclid_step (struct hgcd *hgcd, mp_size_t M,
- struct qstack *quotients)
+/* Multiplies the least significant p limbs of (a;b) by M^-1.
+ Temporary space needed: 2 * (p + M->n). */
+mp_size_t
+mpn_hgcd_matrix_adjust (struct hgcd_matrix *M,
+ mp_size_t n, mp_ptr ap, mp_ptr bp,
+ mp_size_t p, mp_ptr tp)
{
- mp_size_t asize;
+ /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b)
+ = (r11 a - r01 b; - r10 a + r00 b) */
- mp_size_t qsize;
- mp_size_t rsize;
- mp_ptr qp;
- mp_ptr rp;
+ mp_ptr t0 = tp;
+ mp_ptr t1 = tp + p + M->n;
+ mp_limb_t ah, bh;
+ mp_limb_t cy;
- asize = hgcd->row[0].rsize;
- rsize = hgcd->row[1].rsize;
- qsize = asize - rsize + 1;
+ ASSERT (p + M->n < n);
- /* Make sure we have space on stack */
- ASSERT_QSTACK (quotients);
+ /* First compute the two values depending on a, before overwriting a */
- if (qsize > quotients->limb_alloc - quotients->limb_next)
- {
- qstack_rotate (quotients,
- qsize - (quotients->limb_alloc - quotients->limb_next));
- ASSERT (quotients->size_next < QSTACK_MAX_QUOTIENTS);
- }
- else if (quotients->size_next >= QSTACK_MAX_QUOTIENTS)
+ if (M->n >= p)
{
- qstack_rotate (quotients, 0);
+ mpn_mul (t0, M->p[1][1], M->n, ap, p);
+ mpn_mul (t1, M->p[1][0], M->n, ap, p);
}
-
- ASSERT (qsize <= quotients->limb_alloc - quotients->limb_next);
-
- qp = quotients->limb + quotients->limb_next;
-
- rp = hgcd->row[2].rp;
- mpn_tdiv_qr (qp, rp, 0, hgcd->row[0].rp, asize, hgcd->row[1].rp, rsize);
- MPN_NORMALIZE (rp, rsize);
- hgcd->row[2].rsize = rsize;
-
- if (qp[qsize - 1] == 0)
- qsize--;
-
- if (qsize == 1 && qp[0] == 1)
- qsize = 0;
-
- quotients->size[quotients->size_next++] = qsize;
- quotients->limb_next += qsize;
-
- ASSERT_QSTACK (quotients);
-
- /* Update u and v */
- ASSERT (hgcd->size + qsize <= hgcd->alloc);
- hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, qp, qsize);
- ASSERT (hgcd->size < hgcd->alloc);
-
- if (hgcd->row[2].rsize <= M)
- return hgcd_small_1 (hgcd, M, quotients);
else
{
- /* Keep this remainder */
- hgcd->sign = ~hgcd->sign;
-
- HGCD_SWAP4_LEFT (hgcd->row);
- return -1;
+ mpn_mul (t0, ap, p, M->p[1][1], M->n);
+ mpn_mul (t1, ap, p, M->p[1][0], M->n);
}
-}
-/* Called when values have been computed in r[0] and r[1], and the
- latter value is too large, and we know that it's not much too
- large. Returns the updated size for the uv matrix. */
-static mp_size_t
-hgcd_adjust (struct hgcd_row *r, mp_size_t size,
- struct qstack *quotients)
-{
- mp_limb_t c0;
- mp_limb_t c1;
- mp_limb_t d;
-
- /* Compute the correct r1. We have r1' = r1 - d r0, and we always
- have d = 1 or 2. */
+ /* Update a */
+ MPN_COPY (ap, t0, p);
+ ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n);
- ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize));
+ if (M->n >= p)
+ mpn_mul (t0, M->p[0][1], M->n, bp, p);
+ else
+ mpn_mul (t0, bp, p, M->p[0][1], M->n);
- MPN_NORMALIZE (r[1].rp, r[1].rsize);
+ cy = mpn_sub (ap, ap, n, t0, p + M->n);
+ ASSERT (cy <= ah);
+ ah -= cy;
- if (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize))
- {
- c0 = mpn_add_n (r[1].uvp[0], r[1].uvp[0], r[0].uvp[0], size);
- c1 = mpn_add_n (r[1].uvp[1], r[1].uvp[1], r[0].uvp[1], size);
- d = 1;
- }
+ /* Update b */
+ if (M->n >= p)
+ mpn_mul (t0, M->p[0][0], M->n, bp, p);
else
- {
- ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize));
- MPN_NORMALIZE (r[1].rp, r[1].rsize);
- ASSERT (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize));
+ mpn_mul (t0, bp, p, M->p[0][0], M->n);
- c0 = mpn_addmul_1 (r[1].uvp[0], r[0].uvp[0], size, 2);
- c1 = mpn_addmul_1 (r[1].uvp[1], r[0].uvp[1], size, 2);
- d = 2;
- }
+ MPN_COPY (bp, t0, p);
+ bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n);
+ cy = mpn_sub (bp, bp, n, t1, p + M->n);
+ ASSERT (cy <= bh);
+ bh -= cy;
- /* FIXME: Can avoid branches */
- if (c1 != 0)
+ if (ah > 0 || bh > 0)
{
- r[1].uvp[0][size] = c0;
- r[1].uvp[1][size] = c1;
- size++;
+ ap[n] = ah;
+ bp[n] = bh;
+ n++;
}
else
{
- ASSERT (c0 == 0);
+ /* The subtraction can reduce the size by at most one limb. */
+ if (ap[n-1] == 0 && bp[n-1] == 0)
+ n--;
}
-
- /* Remains to adjust the quotient on stack */
- qstack_adjust (quotients, d);
-
- return size;
+ ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
+ return n;
}
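The identity in the comment at the top of mpn_hgcd_matrix_adjust relies on M having determinant 1 (which is what the stated formula assumes), so its inverse is the adjugate with the off-diagonal signs flipped. A standalone plain-C check of that identity on small numbers; the matrix and vector are arbitrary examples:

/* For M = (r00, r01; r10, r11) with det = 1, applying M and then
   (r11, -r01; -r10, r00) must give back the original vector. */
#include <assert.h>

int
main (void)
{
  long r00 = 5, r01 = 3, r10 = 3, r11 = 2;  /* det = 5*2 - 3*3 = 1 */
  long a = 17, b = 11;
  long a1, b1, a2, b2;

  a1 = r00 * a + r01 * b;        /* apply M */
  b1 = r10 * a + r11 * b;

  a2 = r11 * a1 - r01 * b1;      /* apply M^-1 = adjugate */
  b2 = -r10 * a1 + r00 * b1;

  assert (a2 == a && b2 == b);
  return 0;
}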
-/* Reduce using Lehmer steps. Called by mpn_hgcd when r1 has been
- reduced to approximately the right size. Also used by
- mpn_hgcd_lehmer. */
-static int
-hgcd_final (struct hgcd *hgcd, mp_size_t M,
- struct qstack *quotients)
-{
- ASSERT (hgcd->row[0].rsize > M);
- ASSERT (hgcd->row[1].rsize > M);
-
- /* Can be equal when called by hgcd_lehmer. */
- ASSERT (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize,
- hgcd->row[0].rp, hgcd->row[0].rsize));
-
- for (;;)
- {
- mp_size_t L = hgcd->row[0].rsize;
-
- struct hgcd2 R;
- int res;
-
- if (L <= M + 2
- && (L < M + 2 || (hgcd->row[0].rp[M+1] & GMP_NUMB_HIGHBIT) == 0))
- break;
-
- res = mpn_hgcd2_lehmer_step (&R,
- hgcd->row[0].rp, hgcd->row[0].rsize,
- hgcd->row[1].rp, hgcd->row[1].rsize,
- quotients);
-
- if (res == 0)
- {
- /* We must divide to make progress */
- res = euclid_step (hgcd, M, quotients);
-
- if (res >= 0)
- return res;
- }
- else if (res == 1)
- {
- mp_size_t qsize;
-
- /* The quotient that has been computed for r2 is at most 2
- off. So adjust that, and avoid a full division. */
- qstack_drop (quotients);
-
- /* Top two rows of R must be the identity matrix, followed
- by a row (1, q). */
- ASSERT (R.row[0].u == 1 && R.row[0].v == 0);
- ASSERT (R.row[1].u == 0 && R.row[1].v == 1);
- ASSERT (R.row[2].u == 1);
-
- qsize = (R.row[2].v != 0);
-
- hgcd_update_r (hgcd->row, &R.row[2].v, qsize);
- hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size,
- &R.row[2].v, qsize);
- ASSERT (hgcd->size < hgcd->alloc);
-
- if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize,
- hgcd->row[2].rp, hgcd->row[2].rsize))
- hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients);
-
- ASSERT (hgcd->size < hgcd->alloc);
-
- hgcd->sign = ~hgcd->sign;
- HGCD_SWAP4_LEFT (hgcd->row);
- }
- else
- {
- const struct hgcd2_row *s = R.row + (res - 2);
- int sign = R.sign;
- /* Max size after reduction, plus one */
- mp_size_t ralloc = hgcd->row[1].rsize + 1;
-
- if (res == 2)
- {
- qstack_drop (quotients);
- qstack_drop (quotients);
- }
- else if (res == 3)
- {
- sign = ~sign;
- qstack_drop (quotients);
- }
-
- /* s[0] and s[1] correct. */
- hgcd->row[2].rsize
- = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc,
- sign,
- s[0].u, hgcd->row[0].rp, hgcd->row[0].rsize,
- s[0].v, hgcd->row[1].rp, hgcd->row[1].rsize);
-
- hgcd->row[3].rsize
- = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc,
- ~sign,
- s[1].u, hgcd->row[0].rp, hgcd->row[0].rsize,
- s[1].v, hgcd->row[1].rp, hgcd->row[1].rsize);
-
- hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc,
- s, hgcd->row, hgcd->size);
- hgcd->sign ^= sign;
-
- ASSERT (hgcd->row[2].rsize > M);
-
-#if WANT_ASSERT
- switch (res)
- {
- default:
- ASSERT_ALWAYS (0 == "Unexpected value of res");
- break;
- case 2:
- ASSERT (hgcd->row[2].rsize >= L - 1);
- ASSERT (hgcd->row[3].rsize >= L - 2);
- ASSERT (hgcd->row[2].rsize > M + 1);
- ASSERT (hgcd->row[3].rsize > M);
- break;
- case 3:
- ASSERT (hgcd->row[2].rsize >= L - 2);
- ASSERT (hgcd->row[3].rsize >= L - 2);
- ASSERT (hgcd->row[3].rsize > M);
- break;
- case 4:
- ASSERT (hgcd->row[2].rsize >= L - 2);
- ASSERT (hgcd->row[3].rsize < L || hgcd->row[3].rp[L-1] == 1);
- break;
- }
-#endif
- if (hgcd->row[3].rsize <= M)
- {
- /* Can happen only in the res == 4 case */
- ASSERT (res == 4);
-
- /* Backup two steps */
- ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size));
-
- return hgcd_small_2 (hgcd, M, quotients);
- }
-
- HGCD_SWAP4_2 (hgcd->row);
- }
- }
-
- ASSERT (hgcd->row[1].rsize > M);
-
- for (;;)
- {
-#if WANT_ASSERT
- mp_size_t L = hgcd->row[0].rsize;
-#endif
- mp_size_t ralloc;
-
- mp_size_t qsize;
- mp_srcptr qp;
-
- struct hgcd2 R;
- int res;
-
- /* We don't want hgcd2 to pickup any bits below r0p[M-1], so
- don't tell mpn_hgcd2_lehmer_step about them. */
- res = mpn_hgcd2_lehmer_step (&R,
- hgcd->row[0].rp+M-1, hgcd->row[0].rsize-M+1,
- hgcd->row[1].rp+M-1, hgcd->row[1].rsize-M+1,
- quotients);
- if (res == 0)
- {
- /* We must divide to make progress */
- res = euclid_step (hgcd, M, quotients);
-
- if (res >= 0)
- return res;
-
- continue;
- }
-
- if (res == 1)
- {
- mp_size_t qsize;
-
- /* The quotient that has been computed for r2 is at most 2
- off. So adjust that, and avoid a full division. */
- qstack_drop (quotients);
-
- /* Top two rows of R must be the identity matrix, followed
- by a row (1, q). */
- ASSERT (R.row[0].u == 1 && R.row[0].v == 0);
- ASSERT (R.row[1].u == 0 && R.row[1].v == 1);
- ASSERT (R.row[2].u == 1);
-
- qsize = (R.row[2].v != 0);
+/* Size analysis for hgcd:
- hgcd_update_r (hgcd->row, &R.row[2].v, qsize);
- hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size,
- &R.row[2].v, qsize);
- ASSERT (hgcd->size < hgcd->alloc);
+ For the recursive calls, we have n1 <= ceil(n / 2). Then the
+ storage need is determined by the storage for the recursive call
+ computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1
+ (after this, the storage needed for M1 can be recycled).
- if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize,
- hgcd->row[2].rp, hgcd->row[2].rsize))
- hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients);
+ Let S(r) denote the required storage. For M1 we need 4 * (ceil(n1/2) + 1)
+ = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2,
+ and for the hgcd_matrix_mul, we may need 4 ceil(n/2) + 1. In total,
+ 4 * ceil(n/4) + 4 ceil(n/2) + 5 <= 12 ceil(n/4) + 5.
- ASSERT (hgcd->size < hgcd->alloc);
+ For the recursive call, we need S(n1) = S(ceil(n/2)).
- hgcd->sign = ~hgcd->sign;
- HGCD_SWAP4_LEFT (hgcd->row);
-
- continue;
- }
-
- /* Now r0 and r1 are always correct. */
- /* Store new values in rows 2 and 3, to avoid overlap */
-
- /* Max size after reduction, plus one */
- ralloc = hgcd->row[1].rsize + 1;
-
- hgcd->row[2].rsize
- = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc,
- R.sign,
- R.row[0].u, hgcd->row[0].rp, hgcd->row[0].rsize,
- R.row[0].v, hgcd->row[1].rp, hgcd->row[1].rsize);
-
- hgcd->row[3].rsize
- = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc,
- ~R.sign,
- R.row[1].u, hgcd->row[0].rp, hgcd->row[0].rsize,
- R.row[1].v, hgcd->row[1].rp, hgcd->row[1].rsize);
-
- ASSERT (hgcd->row[2].rsize >= L - 1);
- ASSERT (hgcd->row[3].rsize >= L - 2);
-
- ASSERT (hgcd->row[2].rsize > M);
- ASSERT (hgcd->row[3].rsize > M-1);
-
- hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc,
- R.row, hgcd->row, hgcd->size);
- hgcd->sign ^= R.sign;
-
- if (hgcd->row[3].rsize <= M)
- {
- /* Backup two steps */
-
- /* We don't use R.row[2] and R.row[3], so drop the
- corresponding quotients. */
- qstack_drop (quotients);
- qstack_drop (quotients);
-
- return hgcd_small_2 (hgcd, M, quotients);
- }
-
- HGCD_SWAP4_2 (hgcd->row);
-
- if (res == 2)
- {
- qstack_drop (quotients);
- qstack_drop (quotients);
-
- continue;
- }
-
- /* We already know the correct q for computing r2 */
-
- qsize = qstack_get_1 (quotients, &qp);
- ASSERT (qsize < 2);
-
- ASSERT (qsize + hgcd->size <= hgcd->alloc);
- hgcd_update_r (hgcd->row, qp, qsize);
- hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size,
- qp, qsize);
- ASSERT (hgcd->size < hgcd->alloc);
-
- ASSERT (hgcd->row[2].rsize >= M - 2);
-
- if (hgcd->row[2].rsize <= M)
- {
- /* Discard r3 */
- qstack_drop (quotients);
- return hgcd_small_1 (hgcd, M, quotients);
- }
- if (res == 3)
- {
- /* Drop quotient for r3 */
- qstack_drop (quotients);
-
- hgcd->sign = ~hgcd->sign;
- HGCD_SWAP4_LEFT (hgcd->row);
-
- continue;
- }
-
- ASSERT (res == 4);
- ASSERT (hgcd->row[2].rsize > M);
-
- /* We already know the correct q for computing r3 */
- qsize = qstack_get_0 (quotients, &qp);
- ASSERT (qsize < 2);
-
- ASSERT (qsize + hgcd->size <= hgcd->alloc);
- hgcd_update_r (hgcd->row + 1, qp, qsize);
- hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size,
- qp, qsize);
- ASSERT (hgcd->size < hgcd->alloc);
-
- ASSERT (hgcd->row[3].rsize <= M + 1);
- /* Appearantly not true. Probably because we have leading zeros
- when we call hgcd2. */
- /* ASSERT (hgcd->row[3].rsize <= M || hgcd->row[3].rp[M] == 1); */
-
- if (hgcd->row[3].rsize <= M)
- return hgcd_jebelean (hgcd, M);
-
- HGCD_SWAP4_2 (hgcd->row);
- }
-}
+ S(n) <= 12*ceil(n/4) + 5 + S(ceil(n/2))
+ <= 12*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 5k + S(ceil(n/2^k))
+ <= 12*(2 ceil(n/4) + k) + 5k + S(n/2^k)
+ <= 24 ceil(n/4) + 17k + S(n/2^k)
+
+*/
mp_size_t
-mpn_hgcd_itch (mp_size_t asize)
+mpn_hgcd_itch (mp_size_t n)
{
- /* Scratch space is needed for calling hgcd. We need space for the
- results of all recursive calls. In addition, we need space for
- calling hgcd_fix and hgcd_mul, for which N = asize limbs should
- be enough. */
+ unsigned k;
+ int count;
+ mp_size_t nscaled;
- /* Limit on the recursion depth */
- unsigned k = mpn_hgcd_max_recursion (asize);
+ if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
+ return MPN_HGCD_LEHMER_ITCH (n);
- return asize + mpn_hgcd_init_itch (asize + 6 * k) + 12 * k;
-}
+ /* Get the recursion depth. */
+ nscaled = (n - 1) / (HGCD_THRESHOLD - 1);
+ count_leading_zeros (count, nscaled);
+ k = GMP_LIMB_BITS - count;
-/* Repeatedly divides A by B, until the remainder fits in M =
- ceil(asize / 2) limbs. Stores cofactors in HGCD, and pushes the
- quotients on STACK. On success, HGCD->row[0, 1, 2] correspond to
- remainders that are larger than M limbs, while HGCD->row[3]
- correspond to a remainder that fit in M limbs.
-
- Return 0 on failure (if B or A mod B fits in M limbs), otherwise
- return one of 1 - 4 as specified for hgcd_jebelean. */
-int
-mpn_hgcd (struct hgcd *hgcd,
- mp_srcptr ap, mp_size_t asize,
- mp_srcptr bp, mp_size_t bsize,
- struct qstack *quotients,
- mp_ptr tp, mp_size_t talloc)
-{
- mp_size_t N = asize;
- mp_size_t M = (N + 1)/2;
- mp_size_t n;
- mp_size_t m;
-
- struct hgcd R;
- mp_size_t itch;
+ return 24 * ((n+3) / 4) + 17 * k
+ + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD);
+}
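The closed form returned here comes from the recurrence in the size-analysis comment above. The following standalone plain-C sketch prints the recurrence and the closed-form bound side by side for a few sizes; HGCD_THRESHOLD and the Lehmer base cost are replaced by stand-in values, and the recursion depth is counted directly rather than estimated with count_leading_zeros as the real mpn_hgcd_itch does:

/* S(n) <= 12*ceil(n/4) + 5 + S(ceil(n/2)) versus the closed form
   24*ceil(n/4) + 17*k + base, with placeholder threshold and base. */
#include <stdio.h>

#define THRESHOLD 100           /* stand-in for HGCD_THRESHOLD */
#define BASE_COST(n) (2 * (n))  /* stand-in for MPN_HGCD_LEHMER_ITCH */

static long
s_rec (long n)
{
  if (n < THRESHOLD)
    return BASE_COST (n);
  return 12 * ((n + 3) / 4) + 5 + s_rec ((n + 1) / 2);
}

static long
s_closed (long n)
{
  long k = 0, m = n;
  while (m >= THRESHOLD)        /* recursion depth */
    {
      m = (m + 1) / 2;
      k++;
    }
  return 24 * ((n + 3) / 4) + 17 * k + BASE_COST (THRESHOLD);
}

int
main (void)
{
  long n;
  for (n = 100; n <= 100000; n *= 10)
    printf ("n = %ld: recurrence %ld, closed form %ld\n",
            n, s_rec (n), s_closed (n));
  return 0;
}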
- ASSERT (M);
-#if WANT_TRACE
- trace ("hgcd: asize = %d, bsize = %d, HGCD_SCHOENHAGE_THRESHOLD = %d\n",
- asize, bsize, HGCD_SCHOENHAGE_THRESHOLD);
- if (asize < 100)
- trace (" a = %Nd\n"
- " b = %Nd\n", ap, asize, bp, bsize);
-#endif
+/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
+ with elements of size at most (n+1)/2 - 1. Returns new size of a,
+ b, or zero if no reduction is possible. */
- if (bsize <= M)
+mp_size_t
+mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
+ struct hgcd_matrix *M, mp_ptr tp)
+{
+ mp_size_t s = n/2 + 1;
+ mp_size_t n2 = (3*n)/4 + 1;
+
+ mp_size_t p, nn;
+ int success = 0;
+
+ if (n <= s)
+ /* Happens when n <= 2, a fairly uninteresting case but exercised
+ by the random inputs of the testsuite. */
return 0;
- ASSERT (asize >= 2);
-
- /* Initialize, we keep r0 and r1 as the reduced numbers (so far). */
- hgcd_start (hgcd, ap, asize, bp, bsize);
-
- if (BELOW_THRESHOLD (N, HGCD_SCHOENHAGE_THRESHOLD))
- return hgcd_final (hgcd, M, quotients);
+ ASSERT ((ap[n-1] | bp[n-1]) > 0);
- /* Reduce the size to M + m + 1. Usually, only one hgcd call is
- needed, but we may need multiple calls. When finished, the values
- are stored in r0 (potentially large) and r1 (smaller size) */
+ ASSERT ((n+1)/2 - 1 < M->alloc);
- n = N - M;
- m = (n + 1)/2;
+ if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
+ return mpn_hgcd_lehmer (ap, bp, n, M, tp);
- /* The second recursive call can use numbers of size up to n+3 */
- itch = mpn_hgcd_init_itch (n+3);
-
- ASSERT (itch <= talloc);
- mpn_hgcd_init (&R, n+3, tp);
- tp += itch; talloc -= itch;
-
- while (hgcd->row[1].rsize > M + m + 1)
+ p = n/2;
+ nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
+ if (nn > 0)
{
- /* Max size after reduction, plus one */
- mp_size_t ralloc = hgcd->row[1].rsize + 1;
-
- int res = mpn_hgcd (&R,
- hgcd->row[0].rp + M, hgcd->row[0].rsize - M,
- hgcd->row[1].rp + M, hgcd->row[1].rsize - M,
- quotients, tp, talloc);
-
- if (res == 0)
- {
- /* We must divide to make progress */
- res = euclid_step (hgcd, M, quotients);
-
- if (res > 0)
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4);
- if (res >= 0)
- return res;
-
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2);
- }
- else if (res <= 2)
- {
- /* The reason we use hgcd_adjust also when res == 2 is that
- either r2 is correct, and we get it for free.
-
- Or r2 is too large. Then can correct it by a few bignum
- subtractions, and we are *guaranteed* that the result is
- small enough that we don't need another run through this
- loop. */
-
- /* FIXME: For res == 1, the newly computed row[2] will be
- the same as the old row[1], so we do some unnecessary
- computations. */
-
- qstack_drop (quotients);
-
- /* Store new values in rows 2 and 3, to avoid overlap */
- hgcd->row[2].rsize
- = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc,
- ~R.sign, R.size, &R.row[1],
- hgcd->row[0].rp, hgcd->row[1].rp,
- tp, talloc);
-
- hgcd->row[3].rsize
- = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc,
- R.sign, R.size, &R.row[2],
- hgcd->row[0].rp, hgcd->row[1].rp,
- tp, talloc);
-
- ASSERT (hgcd->row[2].rsize > M);
- ASSERT (hgcd->row[3].rsize > M);
-
- /* Computes the uv matrix for the (possibly incorrect)
- values r1, r2. The elements must be smaller than the
- correct ones, since they correspond to a too small q. */
-
- hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc,
- R.row + 1, R.size,
- hgcd->row, hgcd->size,
- tp, talloc);
- hgcd->sign ^= ~R.sign;
-
- if (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize,
- hgcd->row[2].rp, hgcd->row[2].rsize))
- {
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4);
-
- HGCD_SWAP4_2 (hgcd->row);
- }
- else
- {
- /* r2 was too large, i.e. q0 too small. In this case we
- must have r2 % r1 <= r2 - r1 smaller than M + m + 1. */
-
- hgcd->size = hgcd_adjust (hgcd->row + 2, hgcd->size, quotients);
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4);
-
- ASSERT (hgcd->row[3].rsize <= M + m + 1);
-
- if (hgcd->row[3].rsize <= M)
- {
- /* Backup two steps */
- ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size));
-
- return hgcd_small_2 (hgcd, M, quotients);
- }
-
- HGCD_SWAP4_2 (hgcd->row);
-
- /* Loop always terminates here. */
- break;
- }
- }
- else if (res == 3)
- {
- qstack_drop(quotients);
-
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2);
-
- /* Store new values in rows 2 and 3, to avoid overlap */
- hgcd->row[2].rsize
- = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc,
- ~R.sign, R.size, &R.row[1],
- hgcd->row[0].rp, hgcd->row[1].rp,
- tp, talloc);
-
- hgcd->row[3].rsize
- = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc,
- R.sign, R.size, &R.row[2],
- hgcd->row[0].rp, hgcd->row[1].rp,
- tp, talloc);
-
- ASSERT (hgcd->row[2].rsize > M);
- ASSERT (hgcd->row[3].rsize > M);
-
- hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc,
- R.row + 1, R.size,
- hgcd->row, hgcd->size,
- tp, talloc);
- hgcd->sign ^= ~R.sign;
-
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4);
-
- HGCD_SWAP4_2 (hgcd->row);
- }
- else
- {
- ASSERT (res == 4);
-
- /* All of r0, r1, r3 and r3 are correct.
- Compute r2 and r3 */
-
- ASSERT_HGCD (&R,
- hgcd->row[0].rp + M, hgcd->row[0].rsize - M,
- hgcd->row[1].rp + M, hgcd->row[1].rsize - M,
- 0, 4);
-
- /* Store new values in rows 2 and 3, to avoid overlap */
- hgcd->row[2].rsize
- = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc,
- R.sign, R.size, &R.row[2],
- hgcd->row[0].rp, hgcd->row[1].rp,
- tp, talloc);
-
- hgcd->row[3].rsize
- = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc,
- ~R.sign, R.size, &R.row[3],
- hgcd->row[0].rp, hgcd->row[1].rp,
- tp, talloc);
-
- ASSERT (hgcd->row[2].rsize > M);
- ASSERT (hgcd->row[3].rsize <= M + m + 1);
-
- hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc,
- R.row+2, R.size,
- hgcd->row, hgcd->size,
- tp, talloc);
- hgcd->sign ^= R.sign;
-
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4);
-
- if (hgcd->row[3].rsize <= M)
- {
- /* Backup two steps */
- /* Both steps must always be possible, but it's not
- trivial to ASSERT that here. */
- ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size));
-
- return hgcd_small_2 (hgcd, M, quotients);
- }
- HGCD_SWAP4_2 (hgcd->row);
-
- /* Always exit the loop. */
- break;
- }
+ /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
+ = 2 (n - 1) */
+ n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
+ success = 1;
}
-
- ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize);
- ASSERT (hgcd->row[1].rsize > M);
- ASSERT (hgcd->row[1].rsize <= M + m + 1);
-
- if (hgcd->row[0].rsize > M + m + 1)
+ while (n > n2)
{
- /* One euclid step to reduce size. */
- int res = euclid_step (hgcd, M, quotients);
-
- if (res > 0)
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4);
- if (res >= 0)
- return res;
-
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2);
+ /* Needs n + 1 storage */
+ nn = hgcd_step (n, ap, bp, s, M, tp);
+ if (!nn)
+ return success ? n : 0;
+ n = nn;
+ success = 1;
}
- ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize);
- ASSERT (hgcd->row[0].rsize <= M + m + 1);
- ASSERT (hgcd->row[1].rsize > M);
-
- /* Second phase, reduce size until we have one number of size > M
- and one of size <= M+1 */
- while (hgcd->row[1].rsize > M + 1)
+ if (n > s + 2)
{
- mp_size_t k = 2*M - hgcd->row[0].rsize;
-#if WANT_ASSERT
- mp_size_t n1 = hgcd->row[0].rsize - k;
-#endif
- mp_size_t qsize;
- mp_srcptr qp;
- int res;
-
- ASSERT (k + (n1 + 1)/2 == M);
- ASSERT (n1 >= 2);
-
- ASSERT (n1 <= 2*(m + 1));
- ASSERT (n1 <= n + 3);
-
- res = mpn_hgcd (&R,
- hgcd->row[0].rp + k, hgcd->row[0].rsize - k,
- hgcd->row[1].rp + k, hgcd->row[1].rsize - k,
- quotients, tp, talloc);
-
- if (res == 0)
- {
- /* The first remainder was small. Then there's a good chance
- that the remainder A % B is also small. */
- res = euclid_step (hgcd, M, quotients);
+ struct hgcd_matrix M1;
+ mp_size_t scratch;
- if (res > 0)
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4);
- if (res >= 0)
- return res;
+ p = 2*s - n + 1;
+ scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2);
- continue;
- }
-
- if (res == 1)
+ mpn_hgcd_matrix_init(&M1, n - p, tp);
+ nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
+ if (nn > 0)
{
- mp_srcptr qp;
- mp_size_t qsize;
-
- qstack_drop (quotients);
-
- /* Compute possibly incorrect r2 and corresponding u2, v2.
- Incorrect matrix elements must be smaller than the
- correct ones, since they correspond to a too small q. */
- qsize = qstack_get_0 (quotients, &qp);
-
- ASSERT (qsize + hgcd->size <= hgcd->alloc);
- hgcd_update_r (hgcd->row, qp, qsize);
- hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size,
- qp, qsize);
- ASSERT (hgcd->size < hgcd->alloc);
-
- if (!MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize,
- hgcd->row[2].rp, hgcd->row[2].rsize))
- hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients);
-
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 3);
-
- if (hgcd->row[2].rsize <= M)
- {
- /* Backup one steps */
- ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size));
-
- return hgcd_small_1 (hgcd, M, quotients);
- }
-
- HGCD_SWAP4_LEFT (hgcd->row);
- hgcd->sign = ~hgcd->sign;
- continue;
+ /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
+ ASSERT (M->n + 2 >= M1.n);
+
+	  /* Furthermore, if M ends with a quotient (1, q; 0, 1),
+ then either q or q + 1 is a correct quotient, and M1 will
+ start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
+ rules out the case that the size of M * M1 is much
+ smaller than the expected M->n + M1->n. */
+
+ ASSERT (M->n + M1.n < M->alloc);
+
+ /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
+ = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
+ n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
+ /* Needs 4 ceil(n/2) + 1 */
+ mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
+ success = 1;
}
-
- /* Now r0 and r1 are always correct. */
-
- /* It's possible that first two "new" r:s are the same as the
- old ones. In that case skip recomputing them. */
-
- if (!hgcd_start_row_p (&R.row[0], R.size))
- {
- /* Store new values in rows 2 and 3, to avoid overlap */
- hgcd->row[2].rsize
- = mpn_hgcd_fix (k, hgcd->row[2].rp, hgcd->row[0].rsize + 1,
- R.sign, R.size, &R.row[0],
- hgcd->row[0].rp, hgcd->row[1].rp,
- tp, talloc);
-
- hgcd->row[3].rsize
- = mpn_hgcd_fix (k, hgcd->row[3].rp, hgcd->row[1].rsize + 1,
- ~R.sign, R.size, &R.row[1],
- hgcd->row[0].rp, hgcd->row[1].rp,
- tp, talloc);
-
- ASSERT (hgcd->row[2].rsize > M);
- ASSERT (hgcd->row[3].rsize > k);
-
- hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc,
- R.row, R.size, hgcd->row, hgcd->size,
- tp, talloc);
- hgcd->sign ^= R.sign;
-
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4);
-
- if (hgcd->row[3].rsize <= M)
- {
- /* Backup two steps */
-
- /* We don't use R.row[2] and R.row[3], so drop the
- corresponding quotients. */
- qstack_drop (quotients);
- qstack_drop (quotients);
-
- return hgcd_small_2 (hgcd, M, quotients);
- }
-
- HGCD_SWAP4_2 (hgcd->row);
-
- if (res == 2)
- {
- qstack_drop (quotients);
- qstack_drop (quotients);
-
- continue;
- }
- }
-
- ASSERT (res >= 3);
-
- /* We already know the correct q */
- qsize = qstack_get_1 (quotients, &qp);
-
- ASSERT (qsize + hgcd->size <= hgcd->alloc);
- hgcd_update_r (hgcd->row, qp, qsize);
- hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size,
- qp, qsize);
- ASSERT (hgcd->size < hgcd->alloc);
-
- ASSERT (hgcd->row[2].rsize > k);
- if (hgcd->row[2].rsize <= M)
- {
- /* Discard r3 */
- qstack_drop (quotients);
- return hgcd_small_1 (hgcd, M, quotients);
- }
- if (res == 3)
- {
- /* Drop quotient for r3 */
- qstack_drop (quotients);
- hgcd->sign = ~hgcd->sign;
- HGCD_SWAP4_LEFT (hgcd->row);
-
- continue;
- }
-
- ASSERT (hgcd->row[2].rsize > M);
- ASSERT (res == 4);
-
- /* We already know the correct q */
- qsize = qstack_get_0 (quotients, &qp);
-
- ASSERT (qsize + hgcd->size <= hgcd->alloc);
- hgcd_update_r (hgcd->row + 1, qp, qsize);
- hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size,
- qp, qsize);
- ASSERT (hgcd->size < hgcd->alloc);
- ASSERT (hgcd->row[3].rsize <= M + 1);
-
- if (hgcd->row[3].rsize <= M)
- {
-#if WANT_ASSERT
- qstack_rotate (quotients, 0);
-#endif
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4);
- return hgcd_jebelean (hgcd, M);
- }
-
- HGCD_SWAP4_2 (hgcd->row);
}
- ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2);
+ /* This really is the base case */
+ for (;;)
+ {
+ /* Needs s+3 < n */
+ nn = hgcd_step (n, ap, bp, s, M, tp);
+ if (!nn)
+ return success ? n : 0;
- return hgcd_final (hgcd, M, quotients);
+ n = nn;
+ success = 1;
+ }
}
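
For orientation, the rewritten mpn_hgcd above reduces the n-limb pair (ap, bp) in place and records the applied quotients in a struct hgcd_matrix, so that (a_orig; b_orig) = M * (a_reduced; b_reduced). A minimal caller sketch follows; it is illustrative only (not part of this commit) and borrows the test suite's refmpn_malloc_limbs allocator, mirroring the updated tests/mpn/t-hgcd.c further below.

/* Sketch: reduce (ap, bp, n) with the new interface.  Both operands
   must be n limbs, the smaller one zero-padded at the top. */
static mp_size_t
hgcd_call_sketch (mp_ptr ap, mp_ptr bp, mp_size_t n)
{
  struct hgcd_matrix M;
  mp_ptr minit_tp = refmpn_malloc_limbs (MPN_HGCD_MATRIX_INIT_ITCH (n));
  mp_ptr hgcd_tp = refmpn_malloc_limbs (mpn_hgcd_itch (n));
  mp_size_t nn;

  mpn_hgcd_matrix_init (&M, n, minit_tp);   /* M starts out as the identity */
  nn = mpn_hgcd (ap, bp, n, &M, hgcd_tp);   /* 0 means no reduction possible */

  /* On success, ap and bp hold the reduced pair of nn limbs each, and
     the matrix of size M.n holds the accumulated transformation. */
  refmpn_free_limbs (minit_tp);
  refmpn_free_limbs (hgcd_tp);
  return nn;
}
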
diff --git a/mpn/generic/hgcd2.c b/mpn/generic/hgcd2.c
index 4ce579e8a..df6b94025 100644
--- a/mpn/generic/hgcd2.c
+++ b/mpn/generic/hgcd2.c
@@ -89,506 +89,201 @@ div2 (mp_ptr rp,
return q;
}
#else /* GMP_NAIL_BITS != 0 */
-/* Two-limb division optimized for small quotients. Input words
- include nails, which must be zero. */
-static inline mp_limb_t
-div2 (mp_ptr rp,
- mp_limb_t nh, mp_limb_t nl,
- mp_limb_t dh, mp_limb_t dl)
+/* Check all functions for nail support. */
+/* hgcd2 should be defined to take inputs including nail bits, and
+ produce a matrix with elements also including nail bits. This is
+   necessary for the matrix elements to be useful with mpn_mul_1,
+ mpn_addmul_1 and friends. */
+#error Not implemented
+#endif /* GMP_NAIL_BITS != 0 */
+
+/* Reduces a,b until |a-b| fits in one limb + 1 bit. Constructs
+   matrix M. Returns 1 if we make progress, i.e. can perform at least
+   one subtraction. Otherwise returns zero. */
+
+/* FIXME: Possible optimizations:
+
+ The div2 function starts with checking the most significant bit of
+   the numerator. We could maintain normalized operands here and call
+ hgcd with normalized operands only, which should make the code
+ simpler and possibly faster.
+
+ Experiment with table lookups on the most significant bits.
+
+ This function is also a candidate for assembler implementation.
+*/
+int
+mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
+ struct hgcd_matrix1 *M)
{
- mp_limb_t q = 0;
- int cnt;
-
- ASSERT_LIMB(nh);
- ASSERT_LIMB(nl);
- ASSERT_LIMB(dh);
- ASSERT_LIMB(dl);
-
- /* FIXME: Always called with nh > 0 and dh >0. Then it should be
- enough to look at the high limbs to select cnt. */
- for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
- {
- dh = (dh << 1) | (dl >> (GMP_NUMB_BITS - 1));
- dl = (dl << 1) & GMP_NUMB_MASK;
- }
-
- while (cnt)
+ mp_limb_t u00, u01, u10, u11;
+
+ if (ah < 2 || bh < 2)
+ return 0;
+
+ if (ah > bh || (ah == bh && al > bl))
{
- dl = (dh << (GMP_NUMB_BITS - 1)) | (dl >> 1);
- dh = dh >> 1;
- dl &= GMP_NUMB_MASK;
-
- q <<= 1;
- if (nh > dh || (nh == dh && nl >= dl))
- {
- /* FIXME: We could perhaps optimize this by unrolling the
- loop 2^GMP_NUMB_BITS - 1 times? */
- nl -= dl;
- nh -= dh;
- nh -= (nl >> (GMP_LIMB_BITS - 1));
- nl &= GMP_NUMB_MASK;
-
- q |= 1;
- }
- cnt--;
+ sub_ddmmss (ah, al, ah, al, bh, bl);
+ if (ah < 2)
+ return 0;
+
+ u00 = u01 = u11 = 1;
+ u10 = 0;
}
- ASSERT (nh < dh || (nh == dh && nl < dl));
- rp[0] = nl;
- rp[1] = nh;
+ else
+ {
+ sub_ddmmss (bh, bl, bh, bl, ah, al);
+ if (bh < 2)
+ return 0;
- return q;
-}
-#endif /* GMP_NAIL_BITS != 0 */
+ u00 = u10 = u11 = 1;
+ u01 = 0;
+ }
-#define SUB_2(w1,w0, x1,x0, y1,y0) \
- do { \
- ASSERT_LIMB (x1); \
- ASSERT_LIMB (x0); \
- ASSERT_LIMB (y1); \
- ASSERT_LIMB (y0); \
- \
- if (GMP_NAIL_BITS == 0) \
- sub_ddmmss (w1,w0, x1,x0, y1,y0); \
- else \
- { \
- mp_limb_t __w0, __c; \
- SUBC_LIMB (__c, __w0, x0, y0); \
- (w1) = ((x1) - (y1) - __c) & GMP_NUMB_MASK; \
- (w0) = __w0; \
- } \
- } while (0)
-
-static inline void
-qstack_push_0 (struct qstack *stack)
-{
- ASSERT_QSTACK (stack);
+ if (ah < bh)
+ goto subtract_a;
- if (stack->size_next >= QSTACK_MAX_QUOTIENTS)
- qstack_rotate (stack, 0);
+ for (;;)
+ {
+ ASSERT (ah >= bh);
+ if (ah == bh)
+ break;
- stack->size[stack->size_next++] = 0;
-}
+ /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
+ 1), affecting the second column of M. */
+ ASSERT (ah > bh);
+ sub_ddmmss (ah, al, ah, al, bh, bl);
-static inline void
-qstack_push_1 (struct qstack *stack, mp_limb_t q)
-{
- ASSERT (q >= 2);
+ if (ah < 2)
+ break;
- ASSERT_QSTACK (stack);
+ if (ah <= bh)
+ {
+ /* Use q = 1 */
+ u01 += u00;
+ u11 += u10;
+ }
+ else
+ {
+ mp_limb_t r[2];
+ mp_limb_t q = div2 (r, ah, al, bh, bl);
+ al = r[0]; ah = r[1];
+ if (ah < 2)
+ {
+ /* A is too small, but q is correct. */
+ u01 += q * u00;
+ u11 += q * u10;
+ break;
+ }
+ q++;
+ u01 += q * u00;
+ u11 += q * u10;
+ }
+ subtract_a:
+ ASSERT (bh >= ah);
+ if (ah == bh)
+ break;
- if (stack->limb_next >= stack->limb_alloc)
- qstack_rotate (stack, 1);
+ /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
+ 1), affecting the first column of M. */
+ sub_ddmmss (bh, bl, bh, bl, ah, al);
- else if (stack->size_next >= QSTACK_MAX_QUOTIENTS)
- qstack_rotate (stack, 0);
+ if (bh < 2)
+ break;
- stack->size[stack->size_next++] = 1;
- stack->limb[stack->limb_next++] = q;
+ if (bh <= ah)
+ {
+ /* Use q = 1 */
+ u00 += u01;
+ u10 += u11;
+ }
+ else
+ {
+ mp_limb_t r[2];
+ mp_limb_t q = div2 (r, bh, bl, ah, al);
+ bl = r[0]; bh = r[1];
+ if (bh < 2)
+ {
+ /* B is too small, but q is correct. */
+ u00 += q * u01;
+ u10 += q * u11;
+ break;
+ }
+ q++;
+ u00 += q * u01;
+ u10 += q * u11;
+ }
+ }
+ M->u[0][0] = u00; M->u[0][1] = u01;
+ M->u[1][0] = u10; M->u[1][1] = u11;
- ASSERT_QSTACK (stack);
+ return 1;
}
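
A property worth noting here: every update in the loop above multiplies the matrix by a factor of determinant 1 (either (1, q; 0, 1) or (1, 0; q, 1)), and both starting matrices also have determinant 1, so a successful mpn_hgcd2 always satisfies u00*u11 - u01*u10 == 1. That is what allows the inverse to be applied without any division, as (u11, -u01; -u10, u00), in mpn_hgcd_mul_matrix1_inverse_vector further below. An illustrative check (not GMP source):

/* Determinant-1 invariant of a successful mpn_hgcd2; holds exactly,
   hence also in the wrapping mp_limb_t arithmetic used here. */
static void
check_hgcd_matrix1_det (const struct hgcd_matrix1 *M)
{
  ASSERT_ALWAYS (M->u[0][0] * M->u[1][1] - M->u[0][1] * M->u[1][0] == 1);
}
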
-/* Produce r_k from r_i and r_j, and push the corresponding
- quotient. */
-#if __GMP_HAVE_TOKEN_PASTE
-#define HGCD2_STEP(i, j, k) do { \
- SUB_2 (rh ## k, rl ## k, \
- rh ## i, rl ## i, \
- rh ## j, rl ## j); \
- \
- /* Could check here for the special case rh3 == 0, \
- but it's covered by the below condition as well */ \
- if ( rh ## k < rh ## j \
- || ( rh ## k == rh ## j \
- && rl ## k < rl ## j)) \
- { \
- /* Unit quotient */ \
- u ## k = u ## i + u ## j; \
- v ## k = v ## i + v ## j; \
- \
- if (quotients) \
- qstack_push_0 (quotients); \
- } \
- else \
- { \
- mp_limb_t r[2]; \
- mp_limb_t q = 1 + div2 (r, rh ## k, rl ## k, \
- rh ## j, rl ## j); \
- rl ## k = r[0]; rh ## k = r[1]; \
- u ## k = u ## i + q * u ## j; \
- v ## k = v ## i + q * v ## j; \
- \
- if (quotients) \
- qstack_push_1 (quotients, q); \
- } \
-} while (0)
-#else /* ! __GMP_HAVE_TOKEN_PASTE */
-#define HGCD2_STEP(i, j, k) do { \
- SUB_2 (rh/**/k, rl/**/k, \
- rh/**/i, rl/**/i, \
- rh/**/j, rl/**/j); \
- \
- /* Could check here for the special case rh3 == 0, \
- but it's covered by the below condition as well */ \
- if ( rh/**/k < rh/**/j \
- || ( rh/**/k == rh/**/j \
- && rl/**/k < rl/**/j)) \
- { \
- /* Unit quotient */ \
- u/**/k = u/**/i + u/**/j; \
- v/**/k = v/**/i + v/**/j; \
- \
- if (quotients) \
- qstack_push_0 (quotients); \
- } \
- else \
- { \
- mp_limb_t r[2]; \
- mp_limb_t q = 1 + div2 (r, rh/**/k, rl/**/k, \
- rh/**/j, rl/**/j); \
- rl/**/k = r[0]; rh/**/k = r[1]; \
- u/**/k = u/**/i + q * u/**/j; \
- v/**/k = v/**/i + q * v/**/j; \
- \
- if (quotients) \
- qstack_push_1 (quotients, q); \
- } \
-} while (0)
-#endif /* ! __GMP_HAVE_TOKEN_PASTE */
-
-/* Repeatedly divides A by B, until the remainder is a single limb.
- Stores cofactors in HGCD, and pushes the quotients on STACK (unless
- STACK is NULL). On success, HGCD->row[0, 1, 2] correspond to
- remainders that are larger than one limb, while HGCD->row[3]
- correspond to a remainder that fit in a single limb.
-
- Return 0 on failure (if B or A mod B fits in a single limb). Return
- 1 if r0 and r1 are correct, but we still make no progress because
- r0 = A, r1 = B.
-
- Otherwise return 2, 3 or 4 depending on how many of the r:s that
- satisfy Jebelean's criterion. */
-/* FIXME: There are two more micro optimizations that could be done to
- this code:
+/* Multiply (a;b) by M = (u00, u01; u10, u11). Needs n limbs of
+   temporary storage. Both a and b must have space for n + 1 limbs. */
+mp_size_t
+mpn_hgcd_mul_matrix1_vector (struct hgcd_matrix1 *M, mp_size_t n,
+ mp_ptr ap, mp_ptr bp, mp_ptr tp)
+{
+ mp_limb_t ah, bh;
- The div2 function starts with checking the most significant bit of
- the numerator. When we call div2, that bit is know in advance for
- all but the one or two first calls, so we could split div2 in two
- functions, and call the right one.
+ /* Compute (a,b) <-- (u00 a + u10 b, u01 a + u11 b) as
- We could also have two versions of this code, with and without the
- quotient argument, to avoid checking if it's NULL in the middle of
- the loop. */
+ t = a
+ a *= u00
+ a += u10 * b
+ b *= u11
+ b += u01 * t
+ */
-int
-mpn_hgcd2 (struct hgcd2 *hgcd,
- mp_limb_t ah, mp_limb_t al,
- mp_limb_t bh, mp_limb_t bl,
- struct qstack *quotients)
-{
- /* For all divisions, we special case q = 1, which accounts for
- approximately 41% of the quotients for random numbers (Knuth,
- TAOCP 4.5.3) */
-
- /* Use scalar variables */
- mp_limb_t rh1, rl1, u1, v1;
- mp_limb_t rh2, rl2, u2, v2;
- mp_limb_t rh3, rl3, u3, v3;
-
- ASSERT_LIMB(ah);
- ASSERT_LIMB(al);
- ASSERT_LIMB(bh);
- ASSERT_LIMB(bl);
- ASSERT (ah > bh || (ah == bh && al >= bl));
-
- if (bh == 0)
- return 0;
+ /* This copying could be avoided if we let our caller swap some
+ * pointers. */
+ MPN_COPY (tp, ap, n);
- {
- mp_limb_t rh0, rl0, u0, v0;
-
- /* Initialize first two rows */
- rh0 = ah; rl0 = al; u0 = 1; v0 = 0;
- rh1 = bh; rl1 = bl; u1 = 0; v1 = 1;
-
- SUB_2 (rh2, rl2, rh0, rl0, rh1, rl1);
-
- if (rh2 == 0)
- return 0;
-
- if (rh2 < rh1 || (rh2 == rh1 && rl2 < rl1))
- {
- /* Unit quotient */
- v2 = 1;
-
- if (quotients)
- qstack_push_0 (quotients);
- }
- else
- {
- mp_limb_t r[2];
- mp_limb_t q = 1 + div2 (r, rh2, rl2, rh1, rl1);
-
- rl2 = r[0]; rh2 = r[1];
-
- if (rh2 == 0)
- return 0;
-
- v2 = q;
-
- if (quotients)
- qstack_push_1 (quotients, q);
- }
-
- u2 = 1;
-
- /* The simple version of the loop is as follows:
- |
- | hgcd->sign = 0;
- | for (;;)
- | {
- | (q, rh3, rl3]) = divmod (r1, r2);
- | u[3] = u1 + q * u2;
- | v[3] = v1 + q * v2;
- | qstack_push_1 (quotients, q);
- |
- | if (rh3 == 0)
- | break;
- |
- | HGCD2_SHIFT4_LEFT (hgcd->row);
- | hgcd->sign = ~hgcd->sign;
- | }
- |
- | But then we special case for q = 1, and unroll the loop four times
- | to avoid data movement. */
-
- for (;;)
- {
- HGCD2_STEP (1, 2, 3);
- if (rh3 == 0)
- {
- hgcd->row[0].u = u0; hgcd->row[0].v = v0;
-
- hgcd->sign = 0;
-
- break;
- }
- HGCD2_STEP (2, 3, 0);
- if (rh0 == 0)
- {
- hgcd->row[0].u = u1; hgcd->row[0].v = v1;
-
- rh1 = rh2; rl1 = rl2; u1 = u2; v1 = v2;
- rh2 = rh3; rl2 = rl3; u2 = u3; v2 = v3;
- rh3 = rh0; rl3 = rl0; u3 = u0; v3 = v0;
-
- hgcd->sign = -1;
- break;
- }
-
- HGCD2_STEP (3, 0, 1);
- if (rh1 == 0)
- {
- hgcd->row[0].u = u2; hgcd->row[0].v = v2;
- rh2 = rh0; rl2 = rl0; u2 = u0; v2 = v0;
-
- MP_LIMB_T_SWAP (rh1, rh3); MP_LIMB_T_SWAP (rl1, rl3);
- MP_LIMB_T_SWAP ( u1, u3); MP_LIMB_T_SWAP ( v1, v3);
-
- hgcd->sign = 0;
- break;
- }
-
- HGCD2_STEP (0, 1, 2);
- if (rh2 == 0)
- {
- hgcd->row[0].u = u3; hgcd->row[0].v = v3;
-
- rh3 = rh2; rl3 = rl2; u3 = u2; v3 = v2;
- rh2 = rh1; rl2 = rl1; u2 = u1; v2 = v1;
- rh1 = rh0; rl1 = rl0; u1 = u0; v1 = v0;
-
- hgcd->sign = -1;
- break;
- }
- }
- }
-
- ASSERT (rh1 != 0);
- ASSERT (rh2 != 0);
- ASSERT (rh3 == 0);
- ASSERT (rh1 > rh2 || (rh1 == rh2 && rl1 > rl2));
- ASSERT (rh2 > rh3 || (rh2 == rh3 && rl2 > rl3));
-
- /* Coefficients to be returned */
- hgcd->row[1].u = u1; hgcd->row[1].v = v1;
- hgcd->row[2].u = u2; hgcd->row[2].v = v2;
- hgcd->row[3].u = u3; hgcd->row[3].v = v3;
-
- /* Rows 1, 2 and 3 are used below, rh0, rl0, u0 and v0 are not. */
-#if GMP_NAIL_BITS == 0
- {
- mp_limb_t sh;
- mp_limb_t sl;
- mp_limb_t th;
- mp_limb_t tl;
-
- /* Check r2 */
- /* We always have r2 > u2, v2 */
-
- if (hgcd->sign >= 0)
- {
- /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */
- sl = u2 + u1;
- sh = (sl < u1);
- }
- else
- {
- /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */
- sl = v2 + v1;
- sh = (sl < v1);
- }
-
- sub_ddmmss (th, tl, rh1, rl1, rh2, rl2);
-
- if (th < sh || (th == sh && tl < sl))
- return 2 - (hgcd->row[0].v == 0);
-
- /* Check r3 */
-
- if (hgcd->sign >= 0)
- {
- /* Check r3 >= max (-u3, -v3) = |u3| */
- if (rl3 < u3)
- return 3;
-
- /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/
- sl = v3 + v2;
- sh = (sl < v2);
- }
- else
- {
- /* Check r3 >= max (-u3, -v3) = |v3| */
- if (rl3 < v3)
- return 3;
-
- /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */
- sl = u3 + u2;
- sh = (sl < u2);
- }
-
- sub_ddmmss (th, tl, rh2, rl2, 0, rl3);
-
- if (th < sh || (th == sh && tl < sl))
- return 3;
-
- return 4;
- }
-#else /* GMP_NAIL_BITS > 0 */
- {
- mp_limb_t sl;
- mp_limb_t th;
- mp_limb_t tl;
-
- /* Check r2 */
- /* We always have r2 > u2, v2 */
-
- if (hgcd->sign >= 0)
- {
- /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */
- sl = u2 + u1;
- }
- else
- {
- /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */
- sl = v2 + v1;
- }
-
- tl = rl1 - rl2;
- th = rh1 - rh2 - (tl >> (GMP_LIMB_BITS - 1));
- ASSERT_LIMB(th);
-
- if (th < (CNST_LIMB(1) << GMP_NAIL_BITS)
- && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl)
- return 2 - (hgcd->row[0].v == 0);
-
- /* Check r3 */
-
- if (hgcd->sign >= 0)
- {
- /* Check r3 >= max (-u3, -v3) = |u3| */
- if (rl3 < u3)
- return 3;
-
- /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/
- sl = v3 + v2;
- }
- else
- {
- /* Check r3 >= max (-u3, -v3) = |v3| */
- if (rl3 < v3)
- return 3;
-
- /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */
- sl = u3 + u2;
- }
-
- tl = rl2 - rl3;
- th = rh2 - (tl >> (GMP_LIMB_BITS - 1));
- ASSERT_LIMB(th);
-
- if (th < (CNST_LIMB(1) << GMP_NAIL_BITS)
- && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl)
- return 3;
-
- return 4;
- }
-#endif /* GMP_NAIL_BITS > 0 */
+ ah = mpn_mul_1 (ap, ap, n, M->u[0][0]);
+ ah += mpn_addmul_1 (ap, bp, n, M->u[1][0]);
+
+ bh = mpn_mul_1 (bp, bp, n, M->u[1][1]);
+ bh += mpn_addmul_1 (bp, tp, n, M->u[0][1]);
+
+ ap[n] = ah;
+ bp[n] = bh;
+
+ n += (ap[n] | bp[n]) > 0;
+ return n;
}
+/* Multiply (a;b) by M^{-1} = (u11, -u01; -u10, u00) from the left.
+ Needs n limbs of temporary storage. */
mp_size_t
-mpn_hgcd2_fix (mp_ptr rp, mp_size_t ralloc,
- int sign,
- mp_limb_t u, mp_srcptr ap, mp_size_t asize,
- mp_limb_t v, mp_srcptr bp, mp_size_t bsize)
+mpn_hgcd_mul_matrix1_inverse_vector (struct hgcd_matrix1 *M, mp_size_t n,
+ mp_ptr ap, mp_ptr bp, mp_ptr tp)
{
- mp_size_t rsize;
- mp_limb_t cy;
+ mp_limb_t h0, h1;
- ASSERT_LIMB(u);
- ASSERT_LIMB(v);
+ /* Compute (a;b) <-- (u11 a - u01 b; -u10 a + u00 b) as
- if (sign < 0)
- {
- MP_LIMB_T_SWAP (u,v);
- MPN_SRCPTR_SWAP (ap, asize, bp, bsize);
- }
+ t = a
+ a *= u11
+ a -= u01 * b
+ b *= u00
+ b -= u10 * t
+ */
- ASSERT (u > 0);
+ /* This copying could be avoided if we let our caller swap some
+ * pointers. */
+ MPN_COPY (tp, ap, n);
- ASSERT (asize <= ralloc);
- rsize = asize;
- cy = mpn_mul_1 (rp, ap, asize, u);
- if (cy)
- {
- ASSERT (rsize < ralloc);
- rp[rsize++] = cy;
- }
+ h0 = mpn_mul_1 (ap, ap, n, M->u[1][1]);
+ h1 = mpn_submul_1 (ap, bp, n, M->u[0][1]);
+ ASSERT (h0 == h1);
- if (v > 0)
- {
- ASSERT (bsize <= rsize);
- cy = mpn_submul_1 (rp, bp, bsize, v);
- if (cy)
- {
- ASSERT (bsize < rsize);
- ASSERT_NOCARRY (mpn_sub_1 (rp + bsize,
- rp + bsize, rsize - bsize, cy));
- }
+ h0 = mpn_mul_1 (bp, bp, n, M->u[0][0]);
+ h1 = mpn_submul_1 (bp, tp, n, M->u[1][0]);
+ ASSERT (h0 == h1);
- MPN_NORMALIZE (rp, rsize);
- }
- return rsize;
+ n -= (ap[n-1] | bp[n-1]) == 0;
+ return n;
}
-#undef HGCD2_STEP
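
Taken together, the functions above support a Lehmer-style step: compute a 2x2 matrix from the top limbs with mpn_hgcd2, then apply its inverse to the full operands. The sketch below is illustrative only (the actual reduction step lives in hgcd.c and additionally accumulates M1 into the outer struct hgcd_matrix); it assumes n >= 2 and that the operands are normalized so that the two top limbs are a faithful approximation, as discussed in the FIXME comment above.

/* Hypothetical helper, not GMP source: one single-precision reduction
   of the n-limb pair (ap, bp).  tp must hold n limbs of scratch. */
static mp_size_t
lehmer_step_sketch (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  struct hgcd_matrix1 M1;

  if (!mpn_hgcd2 (ap[n-1], ap[n-2], bp[n-1], bp[n-2], &M1))
    return 0;                   /* no progress from the top limbs */

  /* (a;b) <-- M1^{-1} (a;b); the returned size may be one smaller. */
  return mpn_hgcd_mul_matrix1_inverse_vector (&M1, n, ap, bp, tp);
}
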
diff --git a/mpn/generic/matrix22_mul.c b/mpn/generic/matrix22_mul.c
new file mode 100644
index 000000000..0b8b61303
--- /dev/null
+++ b/mpn/generic/matrix22_mul.c
@@ -0,0 +1,254 @@
+/* matrix22_mul.c.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define MUL(rp, ap, an, bp, bn) do { \
+ if (an >= bn) \
+ mpn_mul (rp, ap, an, bp, bn); \
+ else \
+ mpn_mul (rp, bp, bn, ap, an); \
+} while (0)
+
+/* Inputs are unsigned. */
+static int
+abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
+{
+ int c;
+ MPN_CMP (c, ap, bp, n);
+ if (c >= 0)
+ {
+ mpn_sub_n (rp, ap, bp, n);
+ return 0;
+ }
+ else
+ {
+ mpn_sub_n (rp, bp, ap, n);
+ return 1;
+ }
+}
+
+static int
+add_signed_n (mp_ptr rp,
+ mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
+{
+ if (as != bs)
+ return as ^ abs_sub_n (rp, ap, bp, n);
+ else
+ {
+ ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
+ return as;
+ }
+}
+
+mp_size_t
+mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
+{
+ if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
+ || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
+ return 3*rn + 2*mn;
+ else
+ return 4*(rn + mn) + 5;
+}
+
+/* Algorithm:
+
+ / s0 \ / 1 0 0 0 \ / r0 \
+ | s1 | | 0 1 0 0 | | r1 |
+ | s2 | | 0 0 1 1 | | r2 |
+ | s3 | = | -1 0 1 1 | \ r3 /
+ | s4 | | 1 0 -1 0 |
+ | s5 | | 1 1 -1 -1 |
+ \ s6 / \ 0 0 0 1 /
+
+ / t0 \ / 1 0 0 0 \ / m0 \
+ | t1 | | 0 0 1 0 | | m1 |
+ | t2 | | -1 1 0 0 | | m2 |
+ | t3 | = | 1 -1 0 1 | \ m3 /
+ | t4 | | 0 -1 0 1 |
+ | t5 | | 0 0 0 1 |
+ \ t6 / \ -1 1 1 -1 /
+
+ / r0 \ / 1 1 0 0 0 0 0 \ / s0 * t0 \
+ | r1 | = | 1 0 1 1 0 1 0 | | s1 * t1 |
+ | r2 | | 1 0 0 1 1 0 1 | | s2 * t2 |
+ \ r3 / \ 1 0 1 1 1 0 0 / | s3 * t3 |
+ | s4 * t4 |
+ | s5 * t5 |
+ \ s6 * t6 /
+*/
+
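
The scheme above uses 7 products instead of the 8 of a direct 2x2 multiply; the s and t vectors cost only a few additions and subtractions, which is cheap next to the multiplications. As a quick sanity check of the three matrices, the identities can be verified with machine integers; the snippet below is illustrative test code, not part of the library, and is meant for small operands so the products fit in a long.

#include <assert.h>

/* Verify that the s/t evaluation vectors and the recombination matrix
   above indeed compute R = R * M with seven multiplications. */
static void
check_strassen_scheme (long r0, long r1, long r2, long r3,
                       long m0, long m1, long m2, long m3)
{
  long s0 = r0, s1 = r1, s2 = r2 + r3, s3 = -r0 + r2 + r3;
  long s4 = r0 - r2, s5 = r0 + r1 - r2 - r3, s6 = r3;
  long t0 = m0, t1 = m2, t2 = -m0 + m1, t3 = m0 - m1 + m3;
  long t4 = -m1 + m3, t5 = m3, t6 = -m0 + m1 + m2 - m3;
  long p0 = s0*t0, p1 = s1*t1, p2 = s2*t2, p3 = s3*t3;
  long p4 = s4*t4, p5 = s5*t5, p6 = s6*t6;

  assert (p0 + p1           == r0*m0 + r1*m2);  /* new r0 */
  assert (p0 + p2 + p3 + p5 == r0*m1 + r1*m3);  /* new r1 */
  assert (p0 + p3 + p4 + p6 == r2*m0 + r3*m2);  /* new r2 */
  assert (p0 + p2 + p3 + p4 == r2*m1 + r3*m3);  /* new r3 */
}
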
+/* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
+ *
+ * Resulting elements are of size up to rn + mn + 1.
+ *
+ * Temporary storage: 4 rn + 4 mn + 5. */
+void
+mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
+ mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
+ mp_ptr tp)
+{
+ mp_ptr s2, s3, t2, t3, u0, u1;
+ int r2s, r3s, s3s, t2s, t3s, u0s, u1s;
+ s2 = tp; tp += rn;
+ s3 = tp; tp += rn + 1;
+ t2 = tp; tp += mn;
+ t3 = tp; tp += mn + 1;
+ u0 = tp; tp += rn + mn + 1;
+ u1 = tp; /* rn + mn + 2 */
+
+ MUL (u0, r0, rn, m0, mn); /* 0 */
+ MUL (u1, r1, rn, m2, mn); /* 1 */
+
+ MPN_COPY (s2, r3, rn);
+
+ r3[rn] = mpn_add_n (r3, r3, r2, rn);
+ r0[rn] = 0;
+ s3s = abs_sub_n (s3, r3, r0, rn + 1);
+ t2s = abs_sub_n (t2, m1, m0, mn);
+ if (t2s)
+ {
+ t3[mn] = mpn_add_n (t3, m3, t2, mn);
+ t3s = 0;
+ }
+ else
+ {
+ t3s = abs_sub_n (t3, m3, t2, mn);
+ t3[mn] = 0;
+ }
+
+ r2s = abs_sub_n (r2, r0, r2, rn);
+ r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
+
+ MUL(u1, s3, rn+1, t3, mn+1); /* 3 */
+ u1s = s3s ^ t3s;
+ ASSERT (u1[rn+mn+1] == 0);
+ ASSERT (u1[rn+mn] < 4);
+
+ if (u1s)
+ {
+ u0[rn+mn] = 0;
+ u0s = abs_sub_n (u0, u0, u1, rn + mn + 1);
+ }
+ else
+ {
+ u0[rn+mn] = u1[rn+mn] + mpn_add_n (u0, u0, u1, rn + mn);
+ u0s = 0;
+ }
+ MUL(u1, r3, rn + 1, t2, mn); /* 2 */
+ u1s = t2s;
+ ASSERT (u1[rn+mn] < 2);
+
+ u1s = add_signed_n (u1, u0, u0s, u1, u1s, rn + mn + 1);
+
+ t2s = abs_sub_n (t2, m3, m1, mn);
+ if (s3s)
+ {
+ s3[rn] += mpn_add_n (s3, s3, r1, rn);
+ s3s = 0;
+ }
+ else if (s3[rn] > 0)
+ {
+ s3[rn] -= mpn_sub_n (s3, s3, r1, rn);
+ s3s = 1;
+ }
+ else
+ {
+ s3s = abs_sub_n (s3, r1, s3, rn);
+ }
+ MUL (r1, s3, rn+1, m3, mn); /* 5 */
+ ASSERT_NOCARRY(add_signed_n (r1, r1, s3s, u1, u1s, rn + mn + 1));
+ ASSERT (r1[rn + mn] < 2);
+
+ MUL (r3, r2, rn, t2, mn); /* 4 */
+ r3s = r2s ^ t2s;
+ r3[rn + mn] = 0;
+ u0s = add_signed_n (u0, u0, u0s, r3, r3s, rn + mn + 1);
+ ASSERT_NOCARRY (add_signed_n (r3, r3, r3s, u1, u1s, rn + mn + 1));
+ ASSERT (r3[rn + mn] < 2);
+
+ if (t3s)
+ {
+ t3[mn] += mpn_add_n (t3, m2, t3, mn);
+ t3s = 0;
+ }
+ else if (t3[mn] > 0)
+ {
+ t3[mn] -= mpn_sub_n (t3, t3, m2, mn);
+ t3s = 1;
+ }
+ else
+ {
+ t3s = abs_sub_n (t3, m2, t3, mn);
+ }
+ MUL (r2, s2, rn, t3, mn + 1); /* 6 */
+
+ ASSERT_NOCARRY (add_signed_n (r2, r2, t3s, u0, u0s, rn + mn + 1));
+ ASSERT (r2[rn + mn] < 2);
+}
+
+void
+mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
+ mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
+ mp_ptr tp)
+{
+ if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
+ || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
+ {
+ mp_ptr p0, p1;
+ unsigned i;
+
+ /* Temporary storage: 3 rn + 2 mn */
+ p0 = tp + rn;
+ p1 = p0 + rn + mn;
+
+ for (i = 0; i < 2; i++)
+ {
+ MPN_COPY (tp, r0, rn);
+
+ if (rn >= mn)
+ {
+ mpn_mul (p0, r0, rn, m0, mn);
+ mpn_mul (p1, r1, rn, m3, mn);
+ mpn_mul (r0, r1, rn, m2, mn);
+ mpn_mul (r1, tp, rn, m1, mn);
+ }
+ else
+ {
+ mpn_mul (p0, m0, mn, r0, rn);
+ mpn_mul (p1, m3, mn, r1, rn);
+ mpn_mul (r0, m2, mn, r1, rn);
+ mpn_mul (r1, m1, mn, tp, rn);
+ }
+ r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
+ r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
+
+ r0 = r2; r1 = r3;
+ }
+ }
+ else
+ mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
+ m0, m1, m2, m3, mn, tp);
+}
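
A caller-side sketch of the new entry point, following the conventions used by the t-matrix22.c test added below: each element of R must sit in a buffer with room for rn + mn + 1 limbs, and scratch space is sized by mpn_matrix22_mul_itch. The allocator here is the test suite's refmpn_malloc_limbs, used purely for brevity; this is an illustration, not library code.

/* Sketch: R <- R * M, where the ri are rn-limb elements in buffers of
   at least rn + mn + 1 limbs and the mi are mn-limb elements. */
static void
matrix22_mul_sketch (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
                     mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3,
                     mp_size_t mn)
{
  mp_ptr tp = refmpn_malloc_limbs (mpn_matrix22_mul_itch (rn, mn));

  mpn_matrix22_mul (r0, r1, r2, r3, rn,
                    m0, m1, m2, m3, mn, tp);
  /* Each result element now occupies rn + mn + 1 limbs (the top limb
     may be zero). */

  refmpn_free_limbs (tp);
}
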
diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h
index 8dd018237..22a8cfff8 100644
--- a/mpn/ia64/gmp-mparam.h
+++ b/mpn/ia64/gmp-mparam.h
@@ -37,9 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 72
#define POWM_THRESHOLD 295
-#define HGCD_SCHOENHAGE_THRESHOLD 191
+#define HGCD_THRESHOLD 191
#define GCD_ACCEL_THRESHOLD 10
-#define GCD_SCHOENHAGE_THRESHOLD 336
+#define GCD_DC_THRESHOLD 336
#define GCDEXT_SCHOENHAGE_THRESHOLD 649
#define JACOBI_BASE_METHOD 1
diff --git a/mpn/m68k/gmp-mparam.h b/mpn/m68k/gmp-mparam.h
index c18bc5a63..c62304653 100644
--- a/mpn/m68k/gmp-mparam.h
+++ b/mpn/m68k/gmp-mparam.h
@@ -37,10 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 55
#define POWM_THRESHOLD 65
-#define HGCD_SCHOENHAGE_THRESHOLD 116
+#define HGCD_THRESHOLD 116
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 590
-#define GCDEXT_THRESHOLD 35
+#define GCD_DC_THRESHOLD 590
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
diff --git a/mpn/minithres/gmp-mparam.h b/mpn/minithres/gmp-mparam.h
index 7586b7a0f..31b74337b 100644
--- a/mpn/minithres/gmp-mparam.h
+++ b/mpn/minithres/gmp-mparam.h
@@ -33,9 +33,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 6
#define POWM_THRESHOLD 4
-#define HGCD_SCHOENHAGE_THRESHOLD 10
+#define HGCD_THRESHOLD 10
#define GCD_ACCEL_THRESHOLD 2
-#define GCD_SCHOENHAGE_THRESHOLD 20
+#define GCD_DC_THRESHOLD 20
#define GCDEXT_SCHOENHAGE_THRESHOLD 20
#define JACOBI_BASE_METHOD 1
diff --git a/mpn/mips32/gmp-mparam.h b/mpn/mips32/gmp-mparam.h
index a5b736de3..d86fd3f01 100644
--- a/mpn/mips32/gmp-mparam.h
+++ b/mpn/mips32/gmp-mparam.h
@@ -37,7 +37,6 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define POWM_THRESHOLD 78
#define GCD_ACCEL_THRESHOLD 3
-#define GCDEXT_THRESHOLD 18
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
diff --git a/mpn/mips64/gmp-mparam.h b/mpn/mips64/gmp-mparam.h
index 23b012149..d189e895c 100644
--- a/mpn/mips64/gmp-mparam.h
+++ b/mpn/mips64/gmp-mparam.h
@@ -36,10 +36,9 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 53
#define POWM_THRESHOLD 61
-#define HGCD_SCHOENHAGE_THRESHOLD 116
+#define HGCD_THRESHOLD 116
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 492
-#define GCDEXT_THRESHOLD 0 /* always */
+#define GCD_DC_THRESHOLD 492
#define JACOBI_BASE_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
diff --git a/mpn/pa32/gmp-mparam.h b/mpn/pa32/gmp-mparam.h
index 3c6d36c57..005539c0d 100644
--- a/mpn/pa32/gmp-mparam.h
+++ b/mpn/pa32/gmp-mparam.h
@@ -49,6 +49,5 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#ifndef GCD_ACCEL_THRESHOLD
#define GCD_ACCEL_THRESHOLD 46
#endif
-#ifndef GCDEXT_THRESHOLD
#define GCDEXT_THRESHOLD 33
#endif
diff --git a/mpn/pa32/hppa1_1/gmp-mparam.h b/mpn/pa32/hppa1_1/gmp-mparam.h
index d3d6d4436..5ced74548 100644
--- a/mpn/pa32/hppa1_1/gmp-mparam.h
+++ b/mpn/pa32/hppa1_1/gmp-mparam.h
@@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 84
#define POWM_THRESHOLD 166
-#define HGCD_SCHOENHAGE_THRESHOLD 231
+#define HGCD_THRESHOLD 231
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 823
-#define GCDEXT_THRESHOLD 0 /* always */
+#define GCD_DC_THRESHOLD 823
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 5
diff --git a/mpn/pa32/hppa2_0/gmp-mparam.h b/mpn/pa32/hppa2_0/gmp-mparam.h
index 29ea97506..f5667840a 100644
--- a/mpn/pa32/hppa2_0/gmp-mparam.h
+++ b/mpn/pa32/hppa2_0/gmp-mparam.h
@@ -35,7 +35,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define POWM_THRESHOLD 166
#define GCD_ACCEL_THRESHOLD 4
-#define GCDEXT_THRESHOLD 0
#define DIVREM_1_NORM_THRESHOLD 4
#define DIVREM_1_UNNORM_THRESHOLD 6
diff --git a/mpn/pa64/gmp-mparam.h b/mpn/pa64/gmp-mparam.h
index 537da5f71..e9d058f6b 100644
--- a/mpn/pa64/gmp-mparam.h
+++ b/mpn/pa64/gmp-mparam.h
@@ -39,10 +39,9 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 123
#define POWM_THRESHOLD 212
-#define HGCD_SCHOENHAGE_THRESHOLD 292
+#define HGCD_THRESHOLD 292
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 1498
-#define GCDEXT_THRESHOLD 0 /* always */
+#define GCD_DC_THRESHOLD 1498
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
diff --git a/mpn/power/gmp-mparam.h b/mpn/power/gmp-mparam.h
index 8cc6bf0c7..f9b10e6a4 100644
--- a/mpn/power/gmp-mparam.h
+++ b/mpn/power/gmp-mparam.h
@@ -30,10 +30,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 36
#define POWM_THRESHOLD 69
-#define HGCD_SCHOENHAGE_THRESHOLD 97
+#define HGCD_THRESHOLD 97
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 590
-#define GCDEXT_THRESHOLD 41
+#define GCD_DC_THRESHOLD 590
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 12
diff --git a/mpn/powerpc32/750/gmp-mparam.h b/mpn/powerpc32/750/gmp-mparam.h
index f20fd665f..d604e6ed4 100644
--- a/mpn/powerpc32/750/gmp-mparam.h
+++ b/mpn/powerpc32/750/gmp-mparam.h
@@ -35,10 +35,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 35
#define POWM_THRESHOLD 48
-#define HGCD_SCHOENHAGE_THRESHOLD 93
+#define HGCD_THRESHOLD 93
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 676
-#define GCDEXT_THRESHOLD 31
+#define GCD_DC_THRESHOLD 676
#define JACOBI_BASE_METHOD 1
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
diff --git a/mpn/powerpc32/gmp-mparam.h b/mpn/powerpc32/gmp-mparam.h
index 0387e2fb7..a77c98e8a 100644
--- a/mpn/powerpc32/gmp-mparam.h
+++ b/mpn/powerpc32/gmp-mparam.h
@@ -41,10 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 45
#define POWM_THRESHOLD 89
-#define HGCD_SCHOENHAGE_THRESHOLD 145
+#define HGCD_THRESHOLD 145
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 738
-#define GCDEXT_THRESHOLD 16
+#define GCD_DC_THRESHOLD 738
#define JACOBI_BASE_METHOD 1
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
diff --git a/mpn/powerpc64/gmp-mparam.h b/mpn/powerpc64/gmp-mparam.h
index 6fe8a8d40..e0ab478e3 100644
--- a/mpn/powerpc64/gmp-mparam.h
+++ b/mpn/powerpc64/gmp-mparam.h
@@ -37,10 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 28
#define POWM_THRESHOLD 40
-#define HGCD_SCHOENHAGE_THRESHOLD 56
+#define HGCD_THRESHOLD 56
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 408
-#define GCDEXT_THRESHOLD 151
+#define GCD_DC_THRESHOLD 408
#define JACOBI_BASE_METHOD 1
#define MOD_1_NORM_THRESHOLD 0 /* always */
diff --git a/mpn/s390/gmp-mparam.h b/mpn/s390/gmp-mparam.h
index b09191456..d73884667 100644
--- a/mpn/s390/gmp-mparam.h
+++ b/mpn/s390/gmp-mparam.h
@@ -35,7 +35,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define POWM_THRESHOLD 63
#define GCD_ACCEL_THRESHOLD 3
-#define GCDEXT_THRESHOLD 28
#define DIVREM_1_NORM_THRESHOLD 0
#define DIVREM_1_UNNORM_THRESHOLD 5
diff --git a/mpn/sparc32/gmp-mparam.h b/mpn/sparc32/gmp-mparam.h
index d275da51a..3bc6cd6db 100644
--- a/mpn/sparc32/gmp-mparam.h
+++ b/mpn/sparc32/gmp-mparam.h
@@ -32,7 +32,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define POWM_THRESHOLD 28
#define GCD_ACCEL_THRESHOLD 3
-#define GCDEXT_THRESHOLD 0 /* always */
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 3
diff --git a/mpn/sparc32/v8/gmp-mparam.h b/mpn/sparc32/v8/gmp-mparam.h
index fde006e08..f042c19e5 100644
--- a/mpn/sparc32/v8/gmp-mparam.h
+++ b/mpn/sparc32/v8/gmp-mparam.h
@@ -35,10 +35,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 24
#define POWM_THRESHOLD 38
-#define HGCD_SCHOENHAGE_THRESHOLD 69
+#define HGCD_THRESHOLD 69
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 498
-#define GCDEXT_THRESHOLD 0 /* always */
+#define GCD_DC_THRESHOLD 498
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 6
diff --git a/mpn/sparc32/v8/supersparc/gmp-mparam.h b/mpn/sparc32/v8/supersparc/gmp-mparam.h
index c6f2d83eb..feb90ef40 100644
--- a/mpn/sparc32/v8/supersparc/gmp-mparam.h
+++ b/mpn/sparc32/v8/supersparc/gmp-mparam.h
@@ -35,10 +35,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 26
#define POWM_THRESHOLD 79
-#define HGCD_SCHOENHAGE_THRESHOLD 97
+#define HGCD_THRESHOLD 97
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 470
-#define GCDEXT_THRESHOLD 14
+#define GCD_DC_THRESHOLD 470
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
diff --git a/mpn/sparc32/v9/gmp-mparam.h b/mpn/sparc32/v9/gmp-mparam.h
index 2f11e400e..3d48d743b 100644
--- a/mpn/sparc32/v9/gmp-mparam.h
+++ b/mpn/sparc32/v9/gmp-mparam.h
@@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 125
#define POWM_THRESHOLD 150
-#define HGCD_SCHOENHAGE_THRESHOLD 210
+#define HGCD_THRESHOLD 210
#define GCD_ACCEL_THRESHOLD 4
-#define GCD_SCHOENHAGE_THRESHOLD 1291
-#define GCDEXT_THRESHOLD 9
+#define GCD_DC_THRESHOLD 1291
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
diff --git a/mpn/sparc64/gmp-mparam.h b/mpn/sparc64/gmp-mparam.h
index 4bceda1db..9c59e698f 100644
--- a/mpn/sparc64/gmp-mparam.h
+++ b/mpn/sparc64/gmp-mparam.h
@@ -44,7 +44,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define POWM_THRESHOLD 85
#define GCD_ACCEL_THRESHOLD 3
-#define GCDEXT_THRESHOLD 20
#define JACOBI_BASE_METHOD 2
#define DIVREM_1_NORM_THRESHOLD 3
diff --git a/mpn/vax/gmp-mparam.h b/mpn/vax/gmp-mparam.h
index 4b7a2156d..ea262ddc4 100644
--- a/mpn/vax/gmp-mparam.h
+++ b/mpn/vax/gmp-mparam.h
@@ -32,7 +32,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* #define POWM_THRESHOLD */
/* #define GCD_ACCEL_THRESHOLD */
-#define GCDEXT_THRESHOLD 40
/* #define JACOBI_BASE_METHOD */
/* #define DIVREM_1_NORM_THRESHOLD */
diff --git a/mpn/x86/i486/gmp-mparam.h b/mpn/x86/i486/gmp-mparam.h
index f064a3e69..aaddea9f1 100644
--- a/mpn/x86/i486/gmp-mparam.h
+++ b/mpn/x86/i486/gmp-mparam.h
@@ -37,7 +37,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define POWM_THRESHOLD 38
#define GCD_ACCEL_THRESHOLD 3
-#define GCDEXT_THRESHOLD 55
#define JACOBI_BASE_METHOD 2
#define USE_PREINV_DIVREM_1 0
diff --git a/mpn/x86/k6/gmp-mparam.h b/mpn/x86/k6/gmp-mparam.h
index fc3303880..dbf8c59c8 100644
--- a/mpn/x86/k6/gmp-mparam.h
+++ b/mpn/x86/k6/gmp-mparam.h
@@ -37,10 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 76
#define POWM_THRESHOLD 97
-#define HGCD_SCHOENHAGE_THRESHOLD 242
+#define HGCD_THRESHOLD 242
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 1243
-#define GCDEXT_THRESHOLD 40
+#define GCD_DC_THRESHOLD 1243
#define JACOBI_BASE_METHOD 2
#define USE_PREINV_DIVREM_1 0
diff --git a/mpn/x86/k7/gmp-mparam.h b/mpn/x86/k7/gmp-mparam.h
index a3927784d..5c5c1195e 100644
--- a/mpn/x86/k7/gmp-mparam.h
+++ b/mpn/x86/k7/gmp-mparam.h
@@ -41,9 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 84
#define POWM_THRESHOLD 134
-#define HGCD_SCHOENHAGE_THRESHOLD 220
+#define HGCD_THRESHOLD 220
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 908
+#define GCD_DC_THRESHOLD 908
#define GCDEXT_SCHOENHAGE_THRESHOLD 683
#define JACOBI_BASE_METHOD 1
diff --git a/mpn/x86/p6/gmp-mparam.h b/mpn/x86/p6/gmp-mparam.h
index 217facab4..a85c50027 100644
--- a/mpn/x86/p6/gmp-mparam.h
+++ b/mpn/x86/p6/gmp-mparam.h
@@ -45,7 +45,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define POWM_THRESHOLD 131
#define GCD_ACCEL_THRESHOLD 3
-#define GCDEXT_THRESHOLD 33
#define JACOBI_BASE_METHOD 1
#define USE_PREINV_DIVREM_1 0
diff --git a/mpn/x86/p6/mmx/gmp-mparam.h b/mpn/x86/p6/mmx/gmp-mparam.h
index 1456b53a1..c1fa872f0 100644
--- a/mpn/x86/p6/mmx/gmp-mparam.h
+++ b/mpn/x86/p6/mmx/gmp-mparam.h
@@ -54,9 +54,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DC_BDIV_Q_THRESHOLD 10
#define DIVEXACT_JEB_THRESHOLD 48
-#define HGCD_SCHOENHAGE_THRESHOLD 145
+#define HGCD_THRESHOLD 145
#define GCD_ACCEL_THRESHOLD 5
-#define GCD_SCHOENHAGE_THRESHOLD 537
+#define GCD_DC_THRESHOLD 537
#define GCDEXT_SCHOENHAGE_THRESHOLD 948
#define JACOBI_BASE_METHOD 1
diff --git a/mpn/x86/pentium/gmp-mparam.h b/mpn/x86/pentium/gmp-mparam.h
index c7f398da8..5c49c4e3c 100644
--- a/mpn/x86/pentium/gmp-mparam.h
+++ b/mpn/x86/pentium/gmp-mparam.h
@@ -42,10 +42,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 52
#define POWM_THRESHOLD 77
-#define HGCD_SCHOENHAGE_THRESHOLD 121
+#define HGCD_THRESHOLD 121
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 615
-#define GCDEXT_THRESHOLD 13
+#define GCD_DC_THRESHOLD 615
#define JACOBI_BASE_METHOD 2
#define USE_PREINV_DIVREM_1 0
diff --git a/mpn/x86/pentium/mmx/gmp-mparam.h b/mpn/x86/pentium/mmx/gmp-mparam.h
index 40eaecd6f..aae5fec48 100644
--- a/mpn/x86/pentium/mmx/gmp-mparam.h
+++ b/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -42,9 +42,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIV_DC_THRESHOLD 37
#define POWM_THRESHOLD 73
-#define HGCD_SCHOENHAGE_THRESHOLD 97
+#define HGCD_THRESHOLD 97
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 849
+#define GCD_DC_THRESHOLD 849
#define GCDEXT_THRESHOLD 14
#define JACOBI_BASE_METHOD 2
diff --git a/mpn/x86/pentium4/sse2/gmp-mparam.h b/mpn/x86/pentium4/sse2/gmp-mparam.h
index 113356dcc..3ad7a93a1 100644
--- a/mpn/x86/pentium4/sse2/gmp-mparam.h
+++ b/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -48,9 +48,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DC_BDIV_Q_THRESHOLD 10
#define DIVEXACT_JEB_THRESHOLD 80
-#define HGCD_SCHOENHAGE_THRESHOLD 101
+#define HGCD_THRESHOLD 101
#define GCD_ACCEL_THRESHOLD 6
-#define GCD_SCHOENHAGE_THRESHOLD 341
+#define GCD_DC_THRESHOLD 341
#define GCDEXT_SCHOENHAGE_THRESHOLD 375
#define JACOBI_BASE_METHOD 1
diff --git a/mpn/x86_64/core2/gmp-mparam.h b/mpn/x86_64/core2/gmp-mparam.h
index e4a4ea2e8..44e3af47d 100644
--- a/mpn/x86_64/core2/gmp-mparam.h
+++ b/mpn/x86_64/core2/gmp-mparam.h
@@ -49,10 +49,10 @@ MA 02110-1301, USA. */
#define DC_BDIV_Q_THRESHOLD 10
#define DIVEXACT_JEB_THRESHOLD 40
-#define HGCD_SCHOENHAGE_THRESHOLD 191
-#define GCD_ACCEL_THRESHOLD 5
-#define GCD_SCHOENHAGE_THRESHOLD 948
-#define GCDEXT_SCHOENHAGE_THRESHOLD 254
+#define MATRIX22_STRASSEN_THRESHOLD 25
+#define HGCD_THRESHOLD 191
+#define GCD_DC_THRESHOLD 948
+#define GCDEXT_DC_THRESHOLD 254
#define JACOBI_BASE_METHOD 1
#define MOD_1_NORM_THRESHOLD 0 /* always */
diff --git a/mpn/x86_64/gmp-mparam.h b/mpn/x86_64/gmp-mparam.h
index 3c3d94315..fc2cd275c 100644
--- a/mpn/x86_64/gmp-mparam.h
+++ b/mpn/x86_64/gmp-mparam.h
@@ -47,10 +47,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DC_BDIV_Q_THRESHOLD 10
#define DIVEXACT_JEB_THRESHOLD 50
-#define HGCD_SCHOENHAGE_THRESHOLD 145
+#define MATRIX22_STRASSEN_THRESHOLD 22
+#define HGCD_THRESHOLD 111
#define GCD_ACCEL_THRESHOLD 3
-#define GCD_SCHOENHAGE_THRESHOLD 445
-#define GCDEXT_SCHOENHAGE_THRESHOLD 713
+#define GCD_DC_THRESHOLD 412
+#define GCDEXT_DC_THRESHOLD 390
#define JACOBI_BASE_METHOD 1
#define MOD_1_NORM_THRESHOLD 0 /* always */
diff --git a/mpn/x86_64/pentium4/gmp-mparam.h b/mpn/x86_64/pentium4/gmp-mparam.h
index e1c56bcac..afb106f59 100644
--- a/mpn/x86_64/pentium4/gmp-mparam.h
+++ b/mpn/x86_64/pentium4/gmp-mparam.h
@@ -54,9 +54,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DC_BDIV_Q_THRESHOLD 10
#define DIVEXACT_JEB_THRESHOLD 27
-#define HGCD_SCHOENHAGE_THRESHOLD 133
+#define HGCD_THRESHOLD 133
#define GCD_ACCEL_THRESHOLD 10
-#define GCD_SCHOENHAGE_THRESHOLD 792
+#define GCD_DC_THRESHOLD 792
#define GCDEXT_SCHOENHAGE_THRESHOLD 339
#define JACOBI_BASE_METHOD 1
diff --git a/tests/mpn/Makefile.am b/tests/mpn/Makefile.am
index decce7182..f67138a6c 100644
--- a/tests/mpn/Makefile.am
+++ b/tests/mpn/Makefile.am
@@ -22,7 +22,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/tests
LDADD = $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la
check_PROGRAMS = t-asmtype t-aors_1 t-divrem_1 t-fat t-get_d \
- t-instrument t-iord_u t-mp_bases t-perfsqr t-scan t-hgcd
+ t-instrument t-iord_u t-mp_bases t-perfsqr t-scan t-hgcd t-matrix22
TESTS = $(check_PROGRAMS)
diff --git a/tests/mpn/t-hgcd.c b/tests/mpn/t-hgcd.c
index 94d4ca95a..2615fd679 100644
--- a/tests/mpn/t-hgcd.c
+++ b/tests/mpn/t-hgcd.c
@@ -25,7 +25,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "tests.h"
-static int one_test __GMP_PROTO ((mpz_t, mpz_t, int));
+static mp_size_t one_test __GMP_PROTO ((mpz_t, mpz_t, int));
static void debug_mp __GMP_PROTO ((mpz_t, int));
#define MIN_OPERAND_SIZE 2
@@ -34,31 +34,26 @@ static void debug_mp __GMP_PROTO ((mpz_t, int));
struct value { int res; const char *a; const char *b; };
static const struct value hgcd_values[] = {
#if GMP_NUMB_BITS == 32
- { 4,
+ { 5,
"0x1bddff867272a9296ac493c251d7f46f09a5591fe",
"0xb55930a2a68a916450a7de006031068c5ddb0e5c" },
{ 4,
"0x2f0ece5b1ee9c15e132a01d55768dc13",
"0x1c6f4fd9873cdb24466e6d03e1cc66e7" },
- { 4, "0x7FFFFC003FFFFFFFFFC5", "0x3FFFFE001FFFFFFFFFE3"},
+ { 3, "0x7FFFFC003FFFFFFFFFC5", "0x3FFFFE001FFFFFFFFFE3"},
#endif
{ -1, NULL, NULL }
};
struct hgcd_ref
{
- /* Sign here, u and v are stored as absolute values */
- int sign;
-
- mpz_t r[4];
- mpz_t u[4];
- mpz_t v[4];
+ mpz_t m[2][2];
};
static void hgcd_ref_init __GMP_PROTO ((struct hgcd_ref *hgcd));
static void hgcd_ref_clear __GMP_PROTO ((struct hgcd_ref *hgcd));
-static int hgcd_ref __GMP_PROTO ((struct hgcd_ref *hgcd, const mpz_t a, const mpz_t b));
-static int hgcd_ref_equal __GMP_PROTO ((const struct hgcd *hgcd, const struct hgcd_ref *ref));
+static int hgcd_ref __GMP_PROTO ((struct hgcd_ref *hgcd, mpz_t a, mpz_t b));
+static int hgcd_ref_equal __GMP_PROTO ((const struct hgcd_matrix *hgcd, const struct hgcd_ref *ref));
int
main (int argc, char **argv)
@@ -80,7 +75,7 @@ main (int argc, char **argv)
for (i = 0; hgcd_values[i].res >= 0; i++)
{
- int res;
+ mp_size_t res;
mpz_set_str (op1, hgcd_values[i].a, 0);
mpz_set_str (op2, hgcd_values[i].b, 0);
@@ -117,7 +112,7 @@ main (int argc, char **argv)
if (mpz_cmp (op1, op2) < 0)
mpz_swap (op1, op2);
- if (mpz_size(op1) > 0)
+ if (mpz_size (op1) > 0)
one_test (op1, op2, i);
/* Generate a division chain backwards, allowing otherwise
@@ -133,7 +128,7 @@ main (int argc, char **argv)
chain_len = 1000000;
#else
mpz_urandomb (bs, rands, 32);
- chain_len = mpz_get_ui (bs) % (GMP_NUMB_BITS * GCD_SCHOENHAGE_THRESHOLD / 256);
+ chain_len = mpz_get_ui (bs) % (GMP_NUMB_BITS * GCD_DC_THRESHOLD / 256);
#endif
for (j = 0; j < chain_len; j++)
@@ -146,7 +141,7 @@ main (int argc, char **argv)
mpz_add (op1, op1, temp1);
/* Don't generate overly huge operands. */
- if (SIZ (op1) > 3 * GCD_SCHOENHAGE_THRESHOLD)
+ if (SIZ (op1) > 3 * GCD_DC_THRESHOLD)
break;
mpz_urandomb (bs, rands, 32);
@@ -157,13 +152,13 @@ main (int argc, char **argv)
mpz_add (op2, op2, temp1);
/* Don't generate overly huge operands. */
- if (SIZ (op2) > 3 * GCD_SCHOENHAGE_THRESHOLD)
+ if (SIZ (op2) > 3 * GCD_DC_THRESHOLD)
break;
}
if (mpz_cmp (op1, op2) < 0)
mpz_swap (op1, op2);
- if (mpz_size(op1) > 0)
+ if (mpz_size (op1) > 0)
one_test (op1, op2, i);
}
@@ -177,33 +172,37 @@ debug_mp (mpz_t x, int base)
}
static int
+mpz_mpn_equal (const mpz_t a, mp_srcptr bp, mp_size_t bsize);
+
+static mp_size_t
one_test (mpz_t a, mpz_t b, int i)
{
- struct hgcd hgcd;
+ struct hgcd_matrix hgcd;
struct hgcd_ref ref;
- struct qstack quotients;
- int res[2];
+
+ mpz_t ref_r0;
+ mpz_t ref_r1;
+ mpz_t hgcd_r0;
+ mpz_t hgcd_r1;
+
+ mp_size_t res[2];
mp_size_t asize;
mp_size_t bsize;
mp_size_t hgcd_init_scratch;
- mp_size_t qstack_scratch;
mp_size_t hgcd_scratch;
mp_ptr hgcd_init_tp;
- mp_ptr qstack_tp;
mp_ptr hgcd_tp;
asize = a->_mp_size;
bsize = b->_mp_size;
- hgcd_init_scratch = mpn_hgcd_init_itch (asize);
- hgcd_init_tp = refmpn_malloc_limbs (hgcd_init_scratch);
- mpn_hgcd_init (&hgcd, asize, hgcd_init_tp);
+ ASSERT (asize >= bsize);
- qstack_scratch = qstack_itch (asize);
- qstack_tp = refmpn_malloc_limbs (qstack_scratch);
- qstack_init (&quotients, asize, qstack_tp, qstack_scratch);
+ hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (asize);
+ hgcd_init_tp = refmpn_malloc_limbs (hgcd_init_scratch);
+ mpn_hgcd_matrix_init (&hgcd, asize, hgcd_init_tp);
hgcd_scratch = mpn_hgcd_itch (asize);
hgcd_tp = refmpn_malloc_limbs (hgcd_scratch);
@@ -221,28 +220,37 @@ one_test (mpz_t a, mpz_t b, int i)
#endif
hgcd_ref_init (&ref);
- res[0] = hgcd_ref (&ref, a, b);
- res[1] = mpn_hgcd (&hgcd,
- a->_mp_d, asize,
- b->_mp_d, bsize,
- &quotients,
- hgcd_tp, hgcd_scratch);
+ mpz_init_set (ref_r0, a);
+ mpz_init_set (ref_r1, b);
+ res[0] = hgcd_ref (&ref, ref_r0, ref_r1);
+
+ mpz_init_set (hgcd_r0, a);
+ mpz_init_set (hgcd_r1, b);
+ if (bsize < asize)
+ {
+ _mpz_realloc (hgcd_r1, asize);
+ MPN_ZERO (hgcd_r1->_mp_d + bsize, asize - bsize);
+ }
+ res[1] = mpn_hgcd (hgcd_r0->_mp_d,
+ hgcd_r1->_mp_d,
+ asize,
+ &hgcd, hgcd_tp);
if (res[0] != res[1])
{
fprintf (stderr, "ERROR in test %d\n", i);
- fprintf (stderr, "Different return code from hgcd and hgcd_ref\n");
+ fprintf (stderr, "Different return value from hgcd and hgcd_ref\n");
fprintf (stderr, "op1="); debug_mp (a, -16);
fprintf (stderr, "op2="); debug_mp (b, -16);
- fprintf (stderr, "hgcd_ref: %d\n", res[0]);
- fprintf (stderr, "mpn_hgcd: %d\n", res[1]);
+ fprintf (stderr, "hgcd_ref: %ld\n", (long) res[0]);
+ fprintf (stderr, "mpn_hgcd: %ld\n", (long) res[1]);
abort ();
}
if (res[0] > 0)
{
- ASSERT_HGCD (&hgcd, a->_mp_d, asize, b->_mp_d, bsize, 0, 4);
-
- if (!hgcd_ref_equal (&hgcd, &ref))
+ if (!hgcd_ref_equal (&hgcd, &ref)
+ || !mpz_mpn_equal (ref_r0, hgcd_r0->_mp_d, res[1])
+ || !mpz_mpn_equal (ref_r1, hgcd_r1->_mp_d, res[1]))
{
fprintf (stderr, "ERROR in test %d\n", i);
fprintf (stderr, "mpn_hgcd and hgcd_ref returned different values\n");
@@ -253,9 +261,12 @@ one_test (mpz_t a, mpz_t b, int i)
}
refmpn_free_limbs (hgcd_init_tp);
- refmpn_free_limbs (qstack_tp);
refmpn_free_limbs (hgcd_tp);
hgcd_ref_clear (&ref);
+ mpz_clear (ref_r0);
+ mpz_clear (ref_r1);
+ mpz_clear (hgcd_r0);
+ mpz_clear (hgcd_r1);
return res[0];
}
@@ -264,11 +275,11 @@ static void
hgcd_ref_init (struct hgcd_ref *hgcd)
{
unsigned i;
- for (i = 0; i<4; i++)
+ for (i = 0; i<2; i++)
{
- mpz_init (hgcd->r[i]);
- mpz_init (hgcd->u[i]);
- mpz_init (hgcd->v[i]);
+ unsigned j;
+ for (j = 0; j<2; j++)
+ mpz_init (hgcd->m[i][j]);
}
}
@@ -276,137 +287,91 @@ static void
hgcd_ref_clear (struct hgcd_ref *hgcd)
{
unsigned i;
- for (i = 0; i<4; i++)
+ for (i = 0; i<2; i++)
{
- mpz_clear (hgcd->r[i]);
- mpz_clear (hgcd->u[i]);
- mpz_clear (hgcd->v[i]);
+ unsigned j;
+ for (j = 0; j<2; j++)
+ mpz_clear (hgcd->m[i][j]);
}
}
+
static int
-hgcd_ref (struct hgcd_ref *hgcd, const mpz_t a, const mpz_t b)
+sdiv_qr (mpz_t q, mpz_t r, mp_size_t s, const mpz_t a, const mpz_t b)
{
- mp_size_t M = (a->_mp_size + 1) / 2;
- mpz_t t;
+ mpz_fdiv_qr (q, r, a, b);
+ if (mpz_size (r) <= s)
+ {
+ mpz_add (r, r, b);
+ mpz_sub_ui (q, q, 1);
+ }
+
+ return (mpz_sgn (q) > 0);
+}
+
+static int
+hgcd_ref (struct hgcd_ref *hgcd, mpz_t a, mpz_t b)
+{
+ mp_size_t n = MAX (mpz_size (a), mpz_size (b));
+ mp_size_t s = n/2 + 1;
+ mp_size_t asize;
+ mp_size_t bsize;
mpz_t q;
int res;
- if (mpz_size(b) <= M)
+ if (mpz_size (a) <= s || mpz_size (b) <= s)
return 0;
- mpz_init (q);
- mpz_fdiv_qr(q, hgcd->r[2], a, b);
-
- if (mpz_size (hgcd->r[2]) <= M)
+ res = mpz_cmp (a, b);
+ if (res < 0)
{
- mpz_clear (q);
- return 0;
- }
-
- mpz_set (hgcd->r[0], a); mpz_set (hgcd->r[1], b);
+ mpz_sub (b, b, a);
+ if (mpz_size (b) <= s)
+ return 0;
- mpz_set_ui (hgcd->u[0], 1); mpz_set_ui (hgcd->v[0], 0);
- mpz_set_ui (hgcd->u[1], 0); mpz_set_ui (hgcd->v[1], 1);
- mpz_set_ui (hgcd->u[2], 1); mpz_set (hgcd->v[2], q);
+ mpz_set_ui (hgcd->m[0][0], 1); mpz_set_ui (hgcd->m[0][1], 0);
+ mpz_set_ui (hgcd->m[1][0], 1); mpz_set_ui (hgcd->m[1][1], 1);
+ }
+ else if (res > 0)
+ {
+ mpz_sub (a, a, b);
+ if (mpz_size (a) <= s)
+ return 0;
- hgcd->sign = 0;
+ mpz_set_ui (hgcd->m[0][0], 1); mpz_set_ui (hgcd->m[0][1], 1);
+ mpz_set_ui (hgcd->m[1][0], 0); mpz_set_ui (hgcd->m[1][1], 1);
+ }
+ else
+ return 0;
- mpz_init (t);
+ mpz_init (q);
for (;;)
{
- mpz_fdiv_qr(q, hgcd->r[3], hgcd->r[1], hgcd->r[2]);
+ ASSERT (mpz_size (a) > s);
+ ASSERT (mpz_size (b) > s);
- mpz_mul (hgcd->u[3], q, hgcd->u[2]);
- mpz_add (hgcd->u[3], hgcd->u[3], hgcd->u[1]);
-
- mpz_mul (hgcd->v[3], q, hgcd->v[2]);
- mpz_add (hgcd->v[3], hgcd->v[3], hgcd->v[1]);
-
- if (mpz_size (hgcd->r[3]) <= M)
+ if (mpz_cmp (a, b) > 0)
{
-#if 0
- unsigned i;
- printf("hgcd_ref: sign = %d\n", hgcd->sign);
- for (i = 0; i < 4; i++)
- gmp_printf("r = %Zd, u = %Zd, v = %Zd\n",
- hgcd->r[i], hgcd->u[i], hgcd->v[i]);
-#endif
- /* Check Jebelean's criterion */
-
- if (hgcd->sign >= 0)
- {
- /* Check if r1 - r2 >= u2 - u1 */
- mpz_add (t, hgcd->u[2], hgcd->u[1]);
- }
- else
- {
- /* Check if r1 - r2 >= v2 - v1 */
- mpz_add (t, hgcd->v[2], hgcd->v[1]);
- }
-
- /* Check r1 >= t + r2 */
- mpz_add (t, t, hgcd->r[2]);
- if (mpz_cmp (hgcd->r[1], t) < 0)
- {
- res = 2; break;
- }
-
- /* Now r2 is correct */
- if (hgcd->sign >= 0)
- {
- /* Check r3 >= max (-u3, -v3) = u3 */
- if (mpz_cmp (hgcd->r[3], hgcd->u[3]) < 0)
- {
- res = 3; break;
- }
-
- /* Check r3 - r2 >= v3 - v2 */
- mpz_add (t, hgcd->v[3], hgcd->v[2]);
- }
- else
- {
- /* Check r3 >= max (-u3, -v3) = v3 */
- if (mpz_cmp (hgcd->r[3], hgcd->v[3]) < 0)
- {
- res = 3; break;
- }
-
- /* Check r3 - r2 >= u3 - u2 */
- mpz_add (t, hgcd->u[3], hgcd->u[2]);
- }
-
- /* Check r2 >= t + r3 */
- mpz_add (t, t, hgcd->r[3]);
- if (mpz_cmp (hgcd->r[2], t) < 0)
- {
- res = 3; break;
- }
-
- /* Now r3 is correct */
- res = 4; break;
+ if (!sdiv_qr (q, a, s, a, b))
+ break;
+ mpz_addmul (hgcd->m[0][1], q, hgcd->m[0][0]);
+ mpz_addmul (hgcd->m[1][1], q, hgcd->m[1][0]);
+ }
+ else
+ {
+ if (!sdiv_qr (q, b, s, b, a))
+ break;
+ mpz_addmul (hgcd->m[0][0], q, hgcd->m[0][1]);
+ mpz_addmul (hgcd->m[1][0], q, hgcd->m[1][1]);
}
-
- /* Shift rows */
- hgcd->sign = ~hgcd->sign;
- mpz_swap (hgcd->r[0], hgcd->r[1]);
- mpz_swap (hgcd->r[1], hgcd->r[2]);
- mpz_swap (hgcd->r[2], hgcd->r[3]);
-
- mpz_swap (hgcd->u[0], hgcd->u[1]);
- mpz_swap (hgcd->u[1], hgcd->u[2]);
- mpz_swap (hgcd->u[2], hgcd->u[3]);
-
- mpz_swap (hgcd->v[0], hgcd->v[1]);
- mpz_swap (hgcd->v[1], hgcd->v[2]);
- mpz_swap (hgcd->v[2], hgcd->v[3]);
}
- mpz_clear (t);
mpz_clear (q);
- return res;
+ asize = mpz_size (a);
+ bsize = mpz_size (b);
+ return MAX (asize, bsize);
}
static int
@@ -416,25 +381,22 @@ mpz_mpn_equal (const mpz_t a, mp_srcptr bp, mp_size_t bsize)
mp_size_t asize = a->_mp_size;
MPN_NORMALIZE (bp, bsize);
- return asize == bsize && mpn_cmp(ap, bp, asize) == 0;
+ return asize == bsize && mpn_cmp (ap, bp, asize) == 0;
}
static int
-hgcd_ref_equal (const struct hgcd *hgcd, const struct hgcd_ref *ref)
+hgcd_ref_equal (const struct hgcd_matrix *hgcd, const struct hgcd_ref *ref)
{
unsigned i;
- if (ref->sign != hgcd->sign)
- return 0;
-
- for (i = 0; i<4; i++)
+ for (i = 0; i<2; i++)
{
- if (!mpz_mpn_equal (ref->r[i], hgcd->row[i].rp, hgcd->row[i].rsize))
- return 0;
- if (!mpz_mpn_equal (ref->u[i], hgcd->row[i].uvp[0], hgcd->size))
- return 0;
- if (!mpz_mpn_equal (ref->v[i], hgcd->row[i].uvp[1], hgcd->size))
- return 0;
+ unsigned j;
+
+ for (j = 0; j<2; j++)
+ if (!mpz_mpn_equal (ref->m[i][j], hgcd->p[i][j], hgcd->n))
+ return 0;
}
+
return 1;
}
diff --git a/tests/mpn/t-matrix22.c b/tests/mpn/t-matrix22.c
new file mode 100644
index 000000000..17d1dc614
--- /dev/null
+++ b/tests/mpn/t-matrix22.c
@@ -0,0 +1,207 @@
+/* Tests matrix22_mul.
+
+Copyright 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "tests.h"
+
+struct matrix {
+ mp_size_t alloc;
+ mp_size_t n;
+ mp_ptr e00, e01, e10, e11;
+};
+
+static void
+matrix_init (struct matrix *M, mp_size_t n)
+{
+ mp_ptr p = refmpn_malloc_limbs (4*(n+1));
+ M->e00 = p; p += n+1;
+ M->e01 = p; p += n+1;
+ M->e10 = p; p += n+1;
+ M->e11 = p;
+ M->alloc = n + 1;
+ M->n = 0;
+}
+
+static void
+matrix_clear (struct matrix *M)
+{
+ refmpn_free_limbs (M->e00);
+}
+
+static void
+matrix_copy (struct matrix *R, const struct matrix *M)
+{
+ R->n = M->n;
+ MPN_COPY (R->e00, M->e00, M->n);
+ MPN_COPY (R->e01, M->e01, M->n);
+ MPN_COPY (R->e10, M->e10, M->n);
+ MPN_COPY (R->e11, M->e11, M->n);
+}
+
+/* Used with same size, so no need for normalization. */
+static int
+matrix_equal_p (const struct matrix *A, const struct matrix *B)
+{
+ return (A->n == B->n
+ && mpn_cmp (A->e00, B->e00, A->n) == 0
+ && mpn_cmp (A->e01, B->e01, A->n) == 0
+ && mpn_cmp (A->e10, B->e10, A->n) == 0
+ && mpn_cmp (A->e11, B->e11, A->n) == 0);
+}
+
+static void
+matrix_random(struct matrix *M, mp_size_t n, gmp_randstate_ptr rands)
+{
+ M->n = n;
+ mpn_random (M->e00, n);
+ mpn_random (M->e01, n);
+ mpn_random (M->e10, n);
+ mpn_random (M->e11, n);
+}
+
+#define MUL(rp, ap, an, bp, bn) do { \
+ if (an > bn) \
+ mpn_mul (rp, ap, an, bp, bn); \
+ else \
+ mpn_mul (rp, bp, bn, ap, an); \
+ } while(0)
+
+static void
+ref_matrix22_mul (struct matrix *R,
+ const struct matrix *A,
+ const struct matrix *B, mp_ptr tp)
+{
+ mp_size_t an, bn, n;
+ mp_ptr r00, r01, r10, r11, a00, a01, a10, a11, b00, b01, b10, b11;
+
+ if (A->n >= B->n)
+ {
+ r00 = R->e00; a00 = A->e00; b00 = B->e00;
+ r01 = R->e01; a01 = A->e01; b01 = B->e01;
+ r10 = R->e10; a10 = A->e10; b10 = B->e10;
+ r11 = R->e11; a11 = A->e11; b11 = B->e11;
+ an = A->n, bn = B->n;
+ }
+ else
+ {
+ /* Transpose */
+ r00 = R->e00; a00 = B->e00; b00 = A->e00;
+ r01 = R->e10; a01 = B->e10; b01 = A->e10;
+ r10 = R->e01; a10 = B->e01; b10 = A->e01;
+ r11 = R->e11; a11 = B->e11; b11 = A->e11;
+ an = B->n, bn = A->n;
+ }
+ n = an + bn;
+ R->n = n + 1;
+
+ mpn_mul (r00, a00, an, b00, bn);
+ mpn_mul (tp, a01, an, b10, bn);
+ r00[n] = mpn_add_n (r00, r00, tp, n);
+
+ mpn_mul (r01, a00, an, b01, bn);
+ mpn_mul (tp, a01, an, b11, bn);
+ r01[n] = mpn_add_n (r01, r01, tp, n);
+
+ mpn_mul (r10, a10, an, b00, bn);
+ mpn_mul (tp, a11, an, b10, bn);
+ r10[n] = mpn_add_n (r10, r10, tp, n);
+
+ mpn_mul (r11, a10, an, b01, bn);
+ mpn_mul (tp, a11, an, b11, bn);
+ r11[n] = mpn_add_n (r11, r11, tp, n);
+}
+
+static void
+one_test (const struct matrix *A, const struct matrix *B, int i)
+{
+ struct matrix R;
+ struct matrix P;
+ mp_ptr tp;
+
+ matrix_init (&R, A->n + B->n + 1);
+ matrix_init (&P, A->n + B->n + 1);
+
+ tp = refmpn_malloc_limbs (mpn_matrix22_mul_itch (A->n, B->n));
+
+ ref_matrix22_mul (&R, A, B, tp);
+ matrix_copy (&P, A);
+ mpn_matrix22_mul (P.e00, P.e01, P.e10, P.e11, A->n,
+ B->e00, B->e01, B->e10, B->e11, B->n, tp);
+ P.n = A->n + B->n + 1;
+ if (!matrix_equal_p (&R, &P))
+ {
+ fprintf (stderr, "ERROR in test %d\n", i);
+ gmp_fprintf (stderr, "A = (%Nx, %Nx\n %Nx, %Nx)\n"
+ "B = (%Nx, %Nx\n %Nx, %Nx)\n"
+ "R = (%Nx, %Nx (expected)\n %Nx, %Nx)\n"
+ "P = (%Nx, %Nx (incorrect)\n %Nx, %Nx)\n",
+ A->e00, A->n, A->e01, A->n, A->e10, A->n, A->e11, A->n,
+ B->e00, B->n, B->e01, B->n, B->e10, B->n, B->e11, B->n,
+ R.e00, R.n, R.e01, R.n, R.e10, R.n, R.e11, R.n,
+ P.e00, P.n, P.e01, P.n, P.e10, P.n, P.e11, P.n);
+ abort();
+ }
+ refmpn_free_limbs (tp);
+ matrix_clear (&R);
+ matrix_clear (&P);
+}
+
+#define MAX_SIZE (2+2*MATRIX22_STRASSEN_THRESHOLD)
+
+int
+main (int argc, char **argv)
+{
+ struct matrix A;
+ struct matrix B;
+
+ gmp_randstate_ptr rands;
+ mpz_t bs;
+ int i;
+
+ tests_start ();
+ rands = RANDS;
+
+ matrix_init (&A, MAX_SIZE);
+ matrix_init (&B, MAX_SIZE);
+ mpz_init (bs);
+
+ for (i = 0; i < 17; i++)
+ {
+ mp_size_t an, bn;
+ mpz_urandomb (bs, rands, 32);
+ an = 1 + mpz_get_ui (bs) % MAX_SIZE;
+ mpz_urandomb (bs, rands, 32);
+ bn = 1 + mpz_get_ui (bs) % MAX_SIZE;
+
+ matrix_random (&A, an, rands);
+ matrix_random (&B, bn, rands);
+
+ one_test (&A, &B, i);
+ }
+ mpz_clear (bs);
+ matrix_clear (&A);
+ matrix_clear (&B);
+
+ return 0;
+}
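For reference, ref_matrix22_mul above computes the plain schoolbook product R = A*B of two 2x2 matrices (R00 = A00*B00 + A01*B10, and similarly for the other three entries), which mpn_matrix22_mul must reproduce whether it takes the basecase or the Strassen path. A minimal sketch of the same identity at the mpz level, for illustration only (the helper name is ad hoc and not part of this patch):

#include "gmp.h"

/* Schoolbook 2x2 matrix product R = A*B on mpz operands.  The result
   variables must be distinct from all inputs, since each result is
   overwritten before the second product is accumulated into it. */
static void
mpz_matrix22_mul (mpz_t r00, mpz_t r01, mpz_t r10, mpz_t r11,
                  const mpz_t a00, const mpz_t a01,
                  const mpz_t a10, const mpz_t a11,
                  const mpz_t b00, const mpz_t b01,
                  const mpz_t b10, const mpz_t b11)
{
  mpz_mul (r00, a00, b00);  mpz_addmul (r00, a01, b10);
  mpz_mul (r01, a00, b01);  mpz_addmul (r01, a01, b11);
  mpz_mul (r10, a10, b00);  mpz_addmul (r10, a11, b10);
  mpz_mul (r11, a10, b01);  mpz_addmul (r11, a11, b11);
}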
diff --git a/tests/mpz/t-gcd.c b/tests/mpz/t-gcd.c
index 13065bdab..a58832861 100644
--- a/tests/mpz/t-gcd.c
+++ b/tests/mpz/t-gcd.c
@@ -82,10 +82,10 @@ check_data (void)
to reinitialize them for each test. */
mpz_t gcd1, gcd2, s, t, temp1, temp2;
-#if GCD_SCHOENHAGE_THRESHOLD > GCDEXT_SCHOENHAGE_THRESHOLD
-#define MAX_SCHOENHAGE_THRESHOLD GCD_SCHOENHAGE_THRESHOLD
+#if GCD_DC_THRESHOLD > GCDEXT_DC_THRESHOLD
+#define MAX_SCHOENHAGE_THRESHOLD GCD_DC_THRESHOLD
#else
-#define MAX_SCHOENHAGE_THRESHOLD GCDEXT_SCHOENHAGE_THRESHOLD
+#define MAX_SCHOENHAGE_THRESHOLD GCDEXT_DC_THRESHOLD
#endif
/* Define this to make all operands be large enough for Schoenhage gcd
@@ -252,6 +252,7 @@ one_test (mpz_t op1, mpz_t op2, mpz_t ref, int i)
fprintf (stderr, "op1="); debug_mp (op1, -16);
fprintf (stderr, "op2="); debug_mp (op2, -16);
fprintf (stderr, "mpz_gcdext returns:\n");debug_mp (gcd1, -16);
+ fprintf (stderr, "s="); debug_mp (s, -16);
abort ();
}
diff --git a/tune/Makefile.am b/tune/Makefile.am
index 8748cbc4d..96d90ae77 100644
--- a/tune/Makefile.am
+++ b/tune/Makefile.am
@@ -41,7 +41,7 @@ EXTRA_LTLIBRARIES = libspeed.la
libspeed_la_SOURCES = \
common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \
- freq.c gcd_bin.c gcd_accel.c gcd_finda_gen.c \
+ freq.c \
gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \
jacbase1.c jacbase2.c jacbase3.c \
mod_1_div.c mod_1_inv.c modlinv.c \
@@ -124,7 +124,7 @@ DISTCLEANFILES = sqr_basecase.c $(MANY_DISTCLEAN)
TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c
TUNE_MPN_SRCS_BASIC = dc_divrem_n.c divrem_2.c gcd.c gcdext.c get_str.c \
- set_str.c hgcd.c mul_n.c mullow_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c
+ set_str.c matrix22_mul.c hgcd.c mul_n.c mullow_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c
$(TUNE_MPN_SRCS_BASIC):
for i in $(TUNE_MPN_SRCS_BASIC); do \
diff --git a/tune/common.c b/tune/common.c
index c7b9b4e61..9efd4f85a 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -999,18 +999,71 @@ speed_mpn_mullow_basecase (struct speed_params *s)
}
double
+speed_mpn_matrix22_mul (struct speed_params *s)
+{
+  /* The speed_params struct provides only 2 inputs, so we have to
+     invent the other 6. */
+
+ mp_ptr a1, a2, a3;
+ mp_ptr r0, r1, r2, r3;
+ mp_ptr b1, b2, b3;
+ mp_ptr tp;
+ mp_size_t scratch;
+ unsigned i;
+ double t;
+ TMP_DECL;
+
+ TMP_MARK;
+ SPEED_TMP_ALLOC_LIMBS (a1, s->size, s->align_xp);
+ SPEED_TMP_ALLOC_LIMBS (a2, s->size, s->align_xp);
+ SPEED_TMP_ALLOC_LIMBS (a3, s->size, s->align_xp);
+
+ SPEED_TMP_ALLOC_LIMBS (b1, s->size, s->align_yp);
+ SPEED_TMP_ALLOC_LIMBS (b2, s->size, s->align_yp);
+ SPEED_TMP_ALLOC_LIMBS (b3, s->size, s->align_yp);
+
+ SPEED_TMP_ALLOC_LIMBS (r0, 2 * s->size +1, s->align_xp);
+ SPEED_TMP_ALLOC_LIMBS (r1, 2 * s->size +1, s->align_xp);
+ SPEED_TMP_ALLOC_LIMBS (r2, 2 * s->size +1, s->align_xp);
+ SPEED_TMP_ALLOC_LIMBS (r3, 2 * s->size +1, s->align_xp);
+
+ mpn_random (a1, s->size);
+ mpn_random (a2, s->size);
+ mpn_random (a3, s->size);
+ mpn_random (b1, s->size);
+ mpn_random (b2, s->size);
+ mpn_random (b3, s->size);
+
+ scratch = mpn_matrix22_mul_itch (s->size, s->size);
+ SPEED_TMP_ALLOC_LIMBS (tp, scratch, s->align_wp);
+
+ speed_starttime ();
+ i = s->reps;
+ do
+ {
+ MPN_COPY (r0, s->xp, s->size);
+ MPN_COPY (r1, a1, s->size);
+ MPN_COPY (r2, a2, s->size);
+ MPN_COPY (r3, a3, s->size);
+ mpn_matrix22_mul (r0, r1, r2, r3, s->size, s->yp, b1, b2, b3, s->size, tp);
+ }
+ while (--i != 0);
+ t = speed_endtime();
+ TMP_FREE;
+ return t;
+}
+
+double
speed_mpn_hgcd (struct speed_params *s)
{
mp_ptr wp;
- mp_size_t hgcd_init_scratch = mpn_hgcd_init_itch (s->size);
- mp_size_t qstack_scratch = qstack_itch (s->size);
+ mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);
mp_size_t hgcd_scratch = mpn_hgcd_itch (s->size);
mp_ptr ap;
mp_ptr bp;
mp_ptr tmp1, tmp2;
- struct hgcd hgcd;
- struct qstack quotients;
+ struct hgcd_matrix hgcd;
int res;
unsigned i;
double t;
@@ -1024,53 +1077,38 @@ speed_mpn_hgcd (struct speed_params *s)
SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);
SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);
- MPN_COPY (ap, s->xp, s->size);
- MPN_COPY (bp, s->yp, s->size);
- ap[s->size - 1] |= 1;
- bp[s->size - 1] |= 1;
-
- /* We must have a >= b */
- if (mpn_cmp (ap, bp, s->size) < 0)
- MP_PTR_SWAP (ap, bp);
+ s->xp[s->size - 1] |= 1;
+ s->yp[s->size - 1] |= 1;
SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp);
- mpn_hgcd_init (&hgcd, s->size, tmp1);
- SPEED_TMP_ALLOC_LIMBS (tmp2, qstack_scratch, s->align_wp);
- qstack_init (&quotients, s->size, tmp2, qstack_scratch);
+ mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);
SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp);
speed_starttime ();
i = s->reps;
do
{
- qstack_reset (&quotients, s->size);
- res = mpn_hgcd (&hgcd, ap, s->size, bp, s->size,
- &quotients,
- wp, hgcd_scratch);
+ MPN_COPY (ap, s->xp, s->size);
+ MPN_COPY (bp, s->yp, s->size);
+ res = mpn_hgcd (ap, bp, s->size, &hgcd, wp);
}
while (--i != 0);
t = speed_endtime ();
-#if WANT_ASSERT
- if (res)
- ASSERT_HGCD (&hgcd, ap, s->size, bp, s->size, 0, 4);
-#endif
TMP_FREE;
return t;
}
-#if 0
+
double
speed_mpn_hgcd_lehmer (struct speed_params *s)
{
mp_ptr wp;
- mp_size_t hgcd_init_scratch = mpn_hgcd_init_itch (s->size);
- mp_size_t qstack_scratch = qstack_itch (s->size);
- mp_size_t hgcd_scratch = mpn_hgcd_itch (s->size);
+ mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);
+ mp_size_t hgcd_scratch = MPN_HGCD_LEHMER_ITCH (s->size);
mp_ptr ap;
mp_ptr bp;
mp_ptr tmp1, tmp2;
- struct hgcd hgcd;
- struct qstack quotients;
+ struct hgcd_matrix hgcd;
int res;
unsigned i;
double t;
@@ -1084,45 +1122,33 @@ speed_mpn_hgcd_lehmer (struct speed_params *s)
SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);
SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);
- MPN_COPY (ap, s->xp, s->size);
- MPN_COPY (bp, s->yp, s->size);
- ap[s->size - 1] |= 1;
- bp[s->size - 1] |= 1;
-
- /* We must have a >= b */
- if (mpn_cmp (ap, bp, s->size) < 0)
- MP_PTR_SWAP (ap, bp);
+ s->xp[s->size - 1] |= 1;
+ s->yp[s->size - 1] |= 1;
SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp);
- mpn_hgcd_init (&hgcd, s->size, tmp1);
- SPEED_TMP_ALLOC_LIMBS (tmp2, qstack_scratch, s->align_wp);
- qstack_init (&quotients, s->size, tmp2, qstack_scratch);
+ mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);
SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp);
speed_starttime ();
i = s->reps;
do
{
- qstack_reset (&quotients, s->size);
- res = mpn_hgcd_lehmer (&hgcd, ap, s->size, bp, s->size,
- &quotients,
- wp, hgcd_scratch);
+ MPN_COPY (ap, s->xp, s->size);
+ MPN_COPY (bp, s->yp, s->size);
+ res = mpn_hgcd_lehmer (ap, bp, s->size, &hgcd, wp);
}
while (--i != 0);
t = speed_endtime ();
-#if WANT_ASSERT
- if (res)
- ASSERT_HGCD (&hgcd, ap, s->size, bp, s->size, 0, 4);
-#endif
TMP_FREE;
return t;
}
-#endif
+
double
speed_mpn_gcd (struct speed_params *s)
{
SPEED_ROUTINE_MPN_GCD (mpn_gcd);
}
+#if 0
double
speed_mpn_gcd_binary (struct speed_params *s)
{
@@ -1133,7 +1159,7 @@ speed_mpn_gcd_accel (struct speed_params *s)
{
SPEED_ROUTINE_MPN_GCD (mpn_gcd_accel);
}
-
+#endif
#if HAVE_NATIVE_mpn_gcd_finda
double
speed_mpn_gcd_finda (struct speed_params *s)
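The rewritten speed_mpn_hgcd above also shows the new mpn_hgcd calling convention: the operands are reduced in place, and the accumulated transformation lives in a struct hgcd_matrix initialized over caller-provided storage. A minimal sketch of a single call, assuming the TMP allocation macros from gmp-impl.h; the wrapper function is hypothetical, and a non-zero return indicates that a reduction was performed:

#include "gmp.h"
#include "gmp-impl.h"

/* Apply one mpn_hgcd pass to {ap,n} and {bp,n} in place, accumulating
   the transformation in M.  Illustrative only; the top limbs are assumed
   non-zero, as arranged in the benchmark above. */
static mp_size_t
hgcd_step (mp_ptr ap, mp_ptr bp, mp_size_t n)
{
  struct hgcd_matrix M;
  mp_ptr matp, tp;
  mp_size_t res;
  TMP_DECL;

  TMP_MARK;
  matp = TMP_ALLOC_LIMBS (MPN_HGCD_MATRIX_INIT_ITCH (n));
  tp = TMP_ALLOC_LIMBS (mpn_hgcd_itch (n));

  mpn_hgcd_matrix_init (&M, n, matp);
  res = mpn_hgcd (ap, bp, n, &M, tp);

  TMP_FREE;
  return res;
}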
diff --git a/tune/speed.c b/tune/speed.c
index 90e3990de..abe9e70b8 100644
--- a/tune/speed.c
+++ b/tune/speed.c
@@ -255,17 +255,20 @@ const struct routine_t {
{ "mpn_popcount", speed_mpn_popcount },
{ "mpn_hamdist", speed_mpn_hamdist },
+ { "mpn_matrix22_mul", speed_mpn_matrix22_mul },
+
{ "mpn_hgcd", speed_mpn_hgcd },
-#if 0
{ "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer },
-#endif
+
{ "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL },
{ "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
{ "mpn_gcd", speed_mpn_gcd },
+#if 0
{ "mpn_gcd_binary", speed_mpn_gcd_binary },
{ "mpn_gcd_accel", speed_mpn_gcd_accel },
{ "find_a", speed_find_a, FLAG_NODATA },
+#endif
#if HAVE_NATIVE_mpn_gcd_finda
{ "mpn_gcd_finda", speed_mpn_gcd_finda, FLAG_NODATA },
#endif
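With these table entries in place, the new routines can be timed directly from the speed program, e.g. ./speed -s 1-1000 -f 1.1 mpn_matrix22_mul mpn_hgcd mpn_hgcd_lehmer; the size range and step factor given here are only an example, using the existing -s and -f options.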
diff --git a/tune/speed.h b/tune/speed.h
index c2055ca4a..ff8a8f73c 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -182,6 +182,7 @@ double speed_mpn_divrem_2 _PROTO ((struct speed_params *s));
double speed_mpn_divrem_2_div _PROTO ((struct speed_params *s));
double speed_mpn_divrem_2_inv _PROTO ((struct speed_params *s));
double speed_mpn_fib2_ui _PROTO ((struct speed_params *s));
+double speed_mpn_matrix22_mul _PROTO ((struct speed_params *s));
double speed_mpn_hgcd _PROTO ((struct speed_params *s));
double speed_mpn_hgcd_lehmer _PROTO ((struct speed_params *s));
double speed_mpn_gcd _PROTO ((struct speed_params *s));
diff --git a/tune/tuneup.c b/tune/tuneup.c
index fa6778dba..6d1acf9e0 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -162,10 +162,11 @@ mp_size_t mullow_mul_n_threshold = MP_SIZE_T_MAX;
mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX;
mp_size_t div_dc_threshold = MP_SIZE_T_MAX;
mp_size_t powm_threshold = MP_SIZE_T_MAX;
-mp_size_t hgcd_schoenhage_threshold = MP_SIZE_T_MAX;
+mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
+mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX;
-mp_size_t gcd_schoenhage_threshold = MP_SIZE_T_MAX;
-mp_size_t gcdext_schoenhage_threshold = MP_SIZE_T_MAX;
+mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
+mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
@@ -1007,17 +1008,27 @@ tune_powm (void)
void
+tune_matrix22_mul (void)
+{
+ static struct param_t param;
+ param.name = "MATRIX22_STRASSEN_THRESHOLD";
+ param.function = speed_mpn_matrix22_mul;
+ param.min_size = 2;
+ one (&matrix22_strassen_threshold, &param);
+}
+
+void
tune_hgcd (void)
{
static struct param_t param;
- param.name = "HGCD_SCHOENHAGE_THRESHOLD";
+ param.name = "HGCD_THRESHOLD";
param.function = speed_mpn_hgcd;
/* We seem to get strange results for small sizes */
- param.min_size = 50;
- param.step_factor = 0.05;
- one (&hgcd_schoenhage_threshold, &param);
+ param.min_size = 30;
+ one (&hgcd_threshold, &param);
}
+#if 0
void
tune_gcd_accel (void)
{
@@ -1027,29 +1038,29 @@ tune_gcd_accel (void)
param.min_size = 1;
one (&gcd_accel_threshold, &param);
}
-
+#endif
void
-tune_gcd_schoenhage (void)
+tune_gcd_dc (void)
{
static struct param_t param;
- param.name = "GCD_SCHOENHAGE_THRESHOLD";
+ param.name = "GCD_DC_THRESHOLD";
param.function = speed_mpn_gcd;
- param.min_size = hgcd_schoenhage_threshold;
+ param.min_size = hgcd_threshold;
param.max_size = 3000;
param.step_factor = 0.1;
- one (&gcd_schoenhage_threshold, &param);
+ one (&gcd_dc_threshold, &param);
}
void
-tune_gcdext_schoenhage (void)
+tune_gcdext_dc (void)
{
static struct param_t param;
- param.name = "GCDEXT_SCHOENHAGE_THRESHOLD";
+ param.name = "GCDEXT_DC_THRESHOLD";
param.function = speed_mpn_gcdext;
- param.min_size = hgcd_schoenhage_threshold;
+ param.min_size = hgcd_threshold;
param.max_size = 3000;
param.step_factor = 0.1;
- one (&gcdext_schoenhage_threshold, &param);
+ one (&gcdext_dc_threshold, &param);
}
@@ -1771,10 +1782,13 @@ all (void)
tune_powm ();
printf("\n");
+ tune_matrix22_mul ();
tune_hgcd ();
+ tune_gcd_dc ();
+ tune_gcdext_dc ();
+#if 0
tune_gcd_accel ();
- tune_gcd_schoenhage ();
- tune_gcdext_schoenhage ();
+#endif
tune_jacobi_base ();
printf("\n");