diff options
63 files changed, 3274 insertions, 4852 deletions
@@ -37,9 +37,9 @@ Makefile ^doc/version\.texi # All source files in mpn/ are either generated, or links -^mpn/.*\.c -^mpn/.*\.asm -^ +^mpn/[^/]*\.c +^mpn/[^/]*\.asm + ^\.libs .*\.a @@ -29,6 +29,38 @@ (DIVEXACT_BY3_METHOD): Don't default to 0 if HAVE_NATIVE_mpn_divexact_by3c. +2008-09-18 Niels Möller <nisse@lysator.liu.se> + + * mpn/generic/gcd.c (main): Added code for tuning of CHOOSE_P. + + * mpn/generic/hgcd.c (mpn_hgcd_matrix_mul): Assert that inputs are + normalized. + +2008-09-17 Niels Möller <nisse@lysator.liu.se> <nisse@king.swox.se> + + * mpn/generic/gcdext.c (mpn_gcdext): p = n/5 caused a + slowdown for large inputs. As a compromise, use p = n/2 for the + first iteration, and p = n/3 for the rest. Handle the first + iteration specially, since the initial u0 and u1 are trivial. + + * mpn/x86_64/gmp-mparam.h (GCDEXT_DC_THRESHOLD): Reduced threshold + from 409 to 390. + + * mpn/generic/gcdext.c (CHOOSE_P): New macro. Use p = n/5. + (mpn_gcdext): Use CHOOSE_P, and generalized the calculation of + scratch space. + + * tune/tuneup.c (tune_hgcd): Use default step factor. + + * mpn/x86_64/gmp-mparam.h: (GCD_DC_THRESHOLD): Reduced from 493 to + 412. + + * mpn/generic/gcd.c (CHOOSE_P): New macro, to determine the + split when calling hgcd. Use p = 2n/3, as that seems better than + the more obvious split p = n/2. + (mpn_gcd): Use CHOOSE_P, and generalized the calculation of + scratch space. + 2008-09-16 Torbjorn Granlund <tege@swox.com> * mpn/generic/toom_interpolate_7pts.c: Use new mpn_divexact_byN @@ -55,14 +87,175 @@ Choose function depending on DIVEXACT_BY3_METHOD. * gmp-impl.h (DIVEXACT_BY3_METHOD): Provide default. +2008-09-16 Niels Möller <nisse@lysator.liu.se> + + * mpn/generic/hgcd.c (mpn_hgcd_addmul2_n): Moved function to + gcdext.c, where it is used. + * mpn/generic/gcdext.c (addmul2_n): Moved and renamed, was + mpn_hgcd_addmul2_n. Made static. Deleted input normalization. + Deleted rn argument. 
+ (mpn_gcdext): Updated calls to addmul2_n, and added assertions.
+
+ * gmp-impl.h (MPN_HGCD_MATRIX_INIT_ITCH): Increased storage by four limbs.
+ (MPN_HGCD_LEHMER_ITCH): Reduced storage by one limb.
+ (MPN_GCD_SUBDIV_STEP_ITCH): Likewise.
+ (MPN_GCD_LEHMER_N_ITCH): Likewise.
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_init): Use two extra limbs.
+ (hgcd_step): Use overlapping arguments to mpn_tdiv_qr.
+ (mpn_hgcd_matrix_mul): Deleted normalization code. Tighter bounds
+ for the element size of the product. Needs two extra limbs of
+ storage for the elements.
+ (mpn_hgcd_itch): Updated storage calculation.
+
+ * mpn/generic/gcd_subdiv_step.c (mpn_gcd_subdiv_step): Use
+ overlapping arguments to mpn_tdiv_qr. Use mpn_zero_p.
+
+ * mpn/generic/gcd.c (mpn_gcd): Use mpn_zero_p.
+
+2008-09-15 Niels Möller <nisse@lysator.liu.se>
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_init): Updated for deleted
+ tp pointer.
+ (hgcd_matrix_update_q): Likewise.
+ (mpn_hgcd_matrix_mul): Likewise.
+ (mpn_hgcd_itch): Updated calculation of scratch space.
+
+ * gmp-impl.h (struct hgcd_matrix): Deleted tp pointer.
+ (MPN_HGCD_MATRIX_INIT_ITCH): Reduced storage.
+ (mpn_hgcd_step, MPN_HGCD_STEP_ITCH): Deleted declarations.
+
+2008-09-15 Niels Möller <nisse@lysator.liu.se> <nisse@king.swox.se>
+
+ * mpn/x86_64/gmp-mparam.h (MATRIX22_STRASSEN_THRESHOLD): New
+ threshold.
+
+ * mpn/generic/hgcd.c (mpn_hgcd_matrix_mul): Use mpn_matrix22_mul.
+ (mpn_hgcd_itch): Updated calculation of scratch space. Use
+ count_leading_zeros to get the recursion depth.
+
+ * mpn/generic/gcd.c (mpn_gcd): Fixed calculation of scratch space,
+ and use mpn_hgcd_itch.
+
+2008-09-15 Niels Möller <nisse@lysator.liu.se>
+
+ * tune/tuneup.c (tune_matrix22_mul): New function.
+ (all): Use it.
+
+ * tune/common.c (speed_mpn_matrix22_mul): New function.
+
+ * tune/Makefile.am (TUNE_MPN_SRCS_BASIC): Added matrix22_mul.c.
+
+ * tests/mpn/t-matrix22.c: Use MATRIX22_STRASSEN_THRESHOLD to
+ select sizes for tests. 
+ + * gmp-impl.h (MATRIX22_STRASSEN_THRESHOLD): New threshold + + * configure.in (gmp_mpn_functions): Added matrix22_mul. + * gmp-impl.h: Added declarations for mpn_matrix22_mul and related + functions. + + * mpn/Makefile.am (nodist_EXTRA_libmpn_la_SOURCES): Added + matrix22_mul.c. + * tests/mpn/Makefile.am (check_PROGRAMS): Added t-matrix22. + + * tests/mpn/t-matrix22.c: New file. + * mpn/generic/matrix22_mul.c: New file. + +2008-09-11 Niels Möller <nisse@king.swox.se> + + * tune/tuneup.c: Updated tuning of gcdext. + + * mpn/x86_64/gmp-mparam.h (GCDEXT_DC_THRESHOLD): Reduced threshold + from 713 to 409. + +2008-09-11 Niels Möller <nisse@lysator.liu.se> + + * gmp-impl.h: Updated for gcdext changes. + (GCDEXT_DC_THRESHOLD): New constant, renamed from + GCDEXT_SCHOENHAGE_THRESHOLD. + + * mpn/generic/gcdext.c (compute_v): Accept non-normalized a and b + as inputs. + (mpn_gcdext): Rewrote and simplified. Now uses the new mpn_hgcd + interface. + + * mpn/generic/hgcd.c (mpn_hgcd_addmul2_n): Renamed from addmul2_n + and made non-static. Changed interface to take non-normalized + inputs, and only two size arguments. + (mpn_hgcd_matrix_mul): Simplified using new mpn_hgcd_addmul2_n. + + * mpn/generic/gcdext_lehmer.c (mpn_gcdext_lehmer_itch): Deleted + function. + (mpn_gcdext_lehmer_n): Renamed from mpn_gcd_lehmer. Now takes + inputs of equal size. Moved the code for the division step to a + separate function... + * mpn/generic/gcdext_subdiv_step.c (mpn_gcdext_subdiv_step): New + file, new function. + + * configure.in (gmp_mpn_functions): Added gcdext_subdiv_step. + 2008-09-10 Torbjorn Granlund <tege@swox.com> * gmp-h.in: Unconditionally include <cstdio>. +2008-09-10 Niels Möller <nisse@lysator.liu.se> + + * tune/common.c: #if:ed out speed_mpn_gcd_binary and + speed_mpn_gcd_accel. + * tune/speed.c (routine): #if:ed out mpn_gcd_binary, mpn_gcd_accel + and find_a. + * tune/Makefile.am (libspeed_la_SOURCES): Removed gcd_bin.c + gcd_accel.c gcd_finda_gen.c. 
+ * tune/tuneup.c: Enable tuning of GCD_DC_THRESHOLD. + + * mpn/generic/gcd.c (mpn_gcd): Rewrote and simplified. Now uses + the new mpn_hgcd interface. + + * */gmp-mparam.h: Renamed GCD_SCHOENHAGE_THRESHOLD to + GCD_DC_THRESHOLD. + + * mpn/generic/gcd_lehmer.c (mpn_gcd_lehmer_n): Renamed (was + mpn_gcd_lehmer). Now takes inputs of equal size. + + * mpn/generic/gcd_lehmer.c (mpn_gcd_lehmer): Reintroduced gcd_2, + to get better performance for small inputs. + + * mpn/generic/hgcd.c: Don't hardcode small HGCD_THRESHOLD. + * mpn/x86_64/gmp-mparam.h (HGCD_THRESHOLD): Reduced from 145 to + 120. + * */gmp-mparam.h: Renamed HGCD_SCHOENHAGE_THRESHOLD to + HGCD_THRESHOLD. + 2008-09-09 Torbjorn Granlund <tege@swox.com> * doc/gmp.texi: Fix a typo and clarify mpn_gcdext docs. +2008-09-09 Niels Möller <nisse@lysator.liu.se> + + * tune/common.c (speed_mpn_hgcd, speed_mpn_hgcd_lehmer): Adapted + to new hgcd interface. + + * gmp-impl.h (MPN_HGCD_LEHMER_ITCH): New macro. + + * hgcd.c (mpn_hgcd_lehmer): Renamed function, from hgcd_base. Made + non-static. + + * gcd_lehmer.c (mpn_gcd_lehmer): Use hgcd2 also for n == 2. + + * gcdext_lehmer.c (mpn_gcdext_lehmer): Simplified code for + division step. Added proper book-keeping of swaps, which affect + the sign of the returned cofactor. + + * tests/mpz/t-gcd.c (one_test): Display co-factor when mpn_gcdext + fails. + + * gcd_lehmer.c (mpn_gcd_lehmer): At end of loop, need to handle + the special case n == 1 correctly. + + * gcd_subdiv_step.c (mpn_gcd_subdiv_step): Simplified function. + The special cancellation logic is not needed here. + 2008-09-08 Torbjorn Granlund <tege@swox.com> * mpn/generic/invert.c: Add working but slow code. @@ -94,6 +287,26 @@ * gmp-h.in (__GMP_CC): New #define. (__GMP_CFLAGS): New #define. +2008-09-08 Niels Möller <nisse@lysator.liu.se> + + * tests/mpn/t-hgcd.c: Updated tests. Rewrite of hgcd_ref. + + * mpn/generic/gcdext_lehmer.c (mpn_gcdext_lehmer_itch): New function. + (mpn_gcdext_lehmer): Various bugfixes. 
+ + * gcdext.c (mpn_gcdext): Allocate scratch space for gcdext_lehmer. + + * mpn/generic/gcd_lehmer.c (gcd_2): ASSERT that inputs are odd. + (mpn_gcd_lehmer): Added tp argument, for scratch space. Make both + arguments odd before calling gcd_2. + + * mpn/generic/hgcd.c (mpn_hgcd): Allow the trivial case n <= 2, + and return 0 immediately. + + * gmp-impl.h (MPN_EXTRACT_NUMB): New macro. + + * configure.in (gmp_mpn_functions): Added gcdext_lehmer. + 2008-09-05 Torbjorn Granlund <tege@swox.com> * mpn/generic/toom_interpolate_7pts.c: Use mpn_divexact_by3c instead of @@ -856,6 +1069,12 @@ * mpn/generic/mul_fft.c: Optimize many scalar divisions and mod operations into masks and shifts. (mpn_fft_mul_modF_K): Fix a spurious ASSERT_NOCARRY. + (mpn_fft_belge_butterfly, mpn_fft_fft_belgeRec, mpn_fft_fft_belge, + mpn_fft_fft_belgeInvRec, mpn_fft_fft_belgeInv): Add Pierrick Gaudry's + implementation of the cache-optimized "belge" FFT code. + (mpn_fft_fft_sqr, mpn_fft_butterfly, mpn_fft_fft, mpn_fft_fftinv): + Remove. + (mpn_mul_fft_internal): Corresponding updates. 2006-03-26 Torbjorn Granlund <tege@swox.com> @@ -1187,6 +1406,9 @@ * tests/mpz/reuse.c: Test mpz_rootrem. + From Paul Zimmermann: + * mpn/generic/rootrem.c: Complete rewrite. 
+ 2005-10-31 Torbjorn Granlund <tege@swox.com> * mpz/pprime_p.c (mpz_probab_prime_p): Considerably limit trial diff --git a/configure.in b/configure.in index fe0584285..7eea50ee4 100644 --- a/configure.in +++ b/configure.in @@ -2407,8 +2407,11 @@ gmp_mpn_functions="$extra_functions \ fib2_ui mod_1 mod_34lsub1 mode1o pre_divrem_1 pre_mod_1 dump \ mul mul_fft mul_n mul_basecase sqr_basecase random random2 pow_1 \ rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \ - bdivmod gcd_1 gcd gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \ - hgcd2 hgcd qstack mullow_n mullow_basecase \ + bdivmod gcd_1 gcd gcdext_1 gcdext gcd_lehmer gcd_subdiv_step \ + gcdext_lehmer gcdext_subdiv_step \ + tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \ + matrix22_mul \ + hgcd2 hgcd mullow_n mullow_basecase \ mul_toom22 mul_toom32 mul_toom42 mul_toom62 mul_toom53 mul_toom44 \ toom_interpolate_5pts toom_interpolate_7pts invert binvert \ sb_div_qr sb_divappr_q sb_div_q dc_div_qr dc_divappr_q dc_div_q \ @@ -1505,6 +1505,9 @@ __GMP_DECLSPEC mp_size_t mpn_gcd __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr #define mpn_gcd_1 __MPN(gcd_1) __GMP_DECLSPEC mp_limb_t mpn_gcd_1 __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)) __GMP_ATTRIBUTE_PURE; +#define mpn_gcdext_1 __MPN(gcdext_1) +__GMP_DECLSPEC mp_limb_t mpn_gcdext_1 __GMP_PROTO ((mp_ptr, mp_ptr, mp_limb_t, mp_limb_t)) __GMP_ATTRIBUTE_PURE; + #define mpn_gcdext __MPN(gcdext) __GMP_DECLSPEC mp_size_t mpn_gcdext __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t)); diff --git a/gmp-impl.h b/gmp-impl.h index 0433e8527..4dcfc6497 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -71,6 +71,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ mp_limb_t name __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t)) #define DECL_gcd_1(name) \ mp_limb_t name __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)) +#define DECL_gcdext_1(name) \ + mp_limb_t name __GMP_PROTO ((mp_ptr, mp_ptr, mp_limb_t, mp_limb_t)) #define DECL_lshift(name) \ mp_limb_t name __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, unsigned)) #define DECL_mod_1(name) \ @@ -3439,176 +3441,156 @@ void __gmp_invalid_operation _PROTO ((void)) ATTRIBUTE_NORETURN; } \ } while (0) - -/* HGCD definitions */ - -/* Limited by 2 + twice the bitsize of mp_size_t */ -#define QSTACK_MAX_QUOTIENTS 82 - -/* Name mangling */ -#define qstack_itch __gmpn_qstack_itch -#define qstack_init __gmpn_qstack_init -#define qstack_reset __gmpn_qstack_reset -#define qstack_rotate __gmpn_qstack_rotate - -#define mpn_hgcd2 __gmpn_hgcd2 -#define mpn_hgcd2_fix __gmpn_hgcd2_fix -#define mpn_hgcd2_lehmer_step __gmpn_hgcd2_lehmer_step -#define mpn_hgcd_max_recursion __gmpn_hgcd_max_recursion -#define mpn_hgcd_init_itch __gmpn_hgcd_init_itch -#define mpn_hgcd_init __gmpn_hgcd_init -#define mpn_hgcd_lehmer_itch __gmpn_hgcd_lehmer_itch -#define mpn_hgcd_lehmer __gmpn_hgcd_lehmer -#define mpn_hgcd_itch __gmpn_hgcd_itch -#define mpn_hgcd __gmpn_hgcd -#define mpn_hgcd_equal __gmpn_hgcd_equal -#define mpn_hgcd_fix __gmpn_hgcd_fix - -struct qstack -{ - /* Throughout the code we represent q = 1 with qsize = 0. */ - mp_size_t size[QSTACK_MAX_QUOTIENTS]; - mp_ptr limb; - mp_size_t limb_alloc; - - /* Number of quotients to keep when we discard old quotients */ - unsigned nkeep; - - /* Top quotient is of size size[size_next-1], and starts at - limb+limb_next - size[size_next-1]. 
We use size_next == 0 for an - empty stack.*/ - unsigned size_next; - mp_size_t limb_next; -}; +/* Matrix multiplication */ +#define mpn_matrix22_mul __MPN(matrix22_mul) +#define mpn_matrix22_strassen __MPN(matrix22_mul_strassen) +#define mpn_matrix22_mul_itch __MPN(matrix22_mul_itch) mp_size_t -qstack_itch __GMP_PROTO ((mp_size_t)); +mpn_matrix22_mul_itch (mp_size_t, mp_size_t); void -qstack_init __GMP_PROTO ((struct qstack *, mp_size_t, mp_limb_t *, mp_size_t)); - +mpn_matrix22_mul (mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, + mp_srcptr, mp_srcptr, mp_srcptr, mp_srcptr, mp_size_t, + mp_ptr); void -qstack_reset __GMP_PROTO ((struct qstack *, mp_size_t)); +mpn_matrix22_mul_strassen (mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t, + mp_srcptr, mp_srcptr, mp_srcptr, mp_srcptr, mp_size_t, + mp_ptr); -void -qstack_rotate __GMP_PROTO ((struct qstack *, mp_size_t)); - -#if WANT_ASSERT -void -__gmpn_qstack_sanity __GMP_PROTO ((struct qstack *)); -#define ASSERT_QSTACK __gmpn_qstack_sanity -#else -#define ASSERT_QSTACK(stack) +#ifndef MATRIX22_STRASSEN_THRESHOLD +#define MATRIX22_STRASSEN_THRESHOLD 30 #endif -struct hgcd2_row -{ - /* r = (-)u a + (-)v b */ - mp_limb_t u; - mp_limb_t v; -}; +/* HGCD definitions */ + +/* Extract one numb, shifting count bits left + ________ ________ + |___xh___||___xl___| + |____r____| + >count < + + The count includes any nail bits, so it should work fine if count + is computed using count_leading_zeros. If GMP_NAIL_BITS > 0, all of + xh, xl and r include nail bits. Must have 0 < count < GMP_LIMB_BITS. -struct hgcd2 + FIXME: Omit masking with GMP_NUMB_MASK, and let callers do that for + those calls where the count high bits of xh may be non-zero. 
+*/ + +#define MPN_EXTRACT_NUMB(count, xh, xl) \ + ((((xh) << ((count) - GMP_NAIL_BITS)) & GMP_NUMB_MASK) | \ + ((xl) >> (GMP_LIMB_BITS - (count)))) + +#define mpn_hgcd2 __MPN (hgcd2) +#define mpn_hgcd_mul_matrix1_vector __MPN (hgcd_mul_matrix1_vector) +#define mpn_hgcd_mul_matrix1_inverse_vector __MPN (hgcd_mul_matrix1_inverse_vector) + +#define mpn_hgcd_matrix_init __MPN (hgcd_matrix_init) +#define mpn_hgcd_matrix_mul __MPN (hgcd_matrix_mul) +#define mpn_hgcd_matrix_adjust __MPN (hgcd_matrix_adjust) + +#define mpn_hgcd_step __MPN (hgcd_step) +#define mpn_hgcd_itch __MPN (hgcd_itch) +#define mpn_hgcd __MPN (hgcd) +#define mpn_hgcd_lehmer __MPN (hgcd_lehmer) + +#define mpn_gcd_lehmer_n __MPN(gcd_lehmer_n) +#define mpn_gcd_subdiv_step __MPN(gcd_subdiv_step) +#define mpn_gcdext_lehmer_n __MPN(gcdext_lehmer_n) +#define mpn_gcdext_subdiv_step __MPN(gcdext_subdiv_step) + +/* The matrix non-negative M = (u, u'; v,v') keeps track of the + reduction (a;b) = M (alpha; beta) where alpha, beta are smaller + than a, b. The determinant must always be one, so that M has an + inverse (v', -u'; -v, u). Elements always fit in GMP_NUMB_BITS - 1 + bits. 
*/ +struct hgcd_matrix1 { - /* Sign of the first row, sign >= 0 implies that u >= 0 and v <= 0, - sign < 0 implies u <= 0, v >= 0 */ - int sign; - struct hgcd2_row row[4]; + mp_limb_t u[2][2]; }; int -mpn_hgcd2 __GMP_PROTO ((struct hgcd2 *, - mp_limb_t, mp_limb_t, - mp_limb_t, mp_limb_t, - struct qstack *)); +mpn_hgcd2 __GMP_PROTO ((mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, + struct hgcd_matrix1 *)); mp_size_t -mpn_hgcd2_fix __GMP_PROTO ((mp_ptr, mp_size_t, - int, - mp_limb_t, mp_srcptr, mp_size_t, - mp_limb_t, mp_srcptr, mp_size_t)); - -int -mpn_hgcd2_lehmer_step __GMP_PROTO ((struct hgcd2 *, - mp_srcptr, mp_size_t, - mp_srcptr, mp_size_t, - struct qstack *)); - -unsigned -mpn_hgcd_max_recursion __GMP_PROTO ((mp_size_t)); +mpn_hgcd_mul_matrix1_vector __GMP_PROTO ((struct hgcd_matrix1 *, mp_size_t, + mp_ptr, mp_ptr, mp_ptr)); -struct hgcd_row -{ - /* [rp, rsize] should always be normalized. */ - mp_ptr rp; mp_size_t rsize; - mp_ptr uvp[2]; -}; +mp_size_t +mpn_hgcd_mul_matrix1_inverse_vector __GMP_PROTO ((struct hgcd_matrix1 *, mp_size_t, + mp_ptr, mp_ptr, mp_ptr)); -struct hgcd +struct hgcd_matrix { - int sign; - /* Space allocated for the uv entries, for sanity checking */ + /* For sanity checking only */ mp_size_t alloc; - /* Size of the largest u,v entry, usually row[3].uvp[1]. This - element should be normalized. Smaller elements must be zero - padded, and all unused limbs (i.e. between size and alloc) must - be zero. 
*/ - mp_size_t size; - struct hgcd_row row[4]; + + mp_size_t n; + mp_ptr p[2][2]; }; -mp_size_t -mpn_hgcd_init_itch __GMP_PROTO ((mp_size_t)); +#define MPN_HGCD_MATRIX_INIT_ITCH(n) (4 * ((n+1)/2 + 1)) void -mpn_hgcd_init __GMP_PROTO ((struct hgcd *, - mp_size_t, - mp_limb_t *)); +mpn_hgcd_matrix_init __GMP_PROTO ((struct hgcd_matrix *, mp_size_t, mp_ptr)); +void +mpn_hgcd_matrix_mul __GMP_PROTO ((struct hgcd_matrix *, const struct hgcd_matrix *, + mp_ptr)); mp_size_t -mpn_hgcd_lehmer_itch __GMP_PROTO ((mp_size_t)); - -int -mpn_hgcd_lehmer __GMP_PROTO ((struct hgcd *, - mp_srcptr, mp_size_t, - mp_srcptr, mp_size_t, - struct qstack *, - mp_ptr, mp_size_t)); +mpn_hgcd_matrix_adjust __GMP_PROTO ((struct hgcd_matrix *, + mp_size_t, mp_ptr, mp_ptr, + mp_size_t, mp_ptr)); mp_size_t mpn_hgcd_itch __GMP_PROTO ((mp_size_t)); -int -mpn_hgcd __GMP_PROTO ((struct hgcd *, - mp_srcptr, mp_size_t, - mp_srcptr, mp_size_t, - struct qstack *, - mp_ptr, mp_size_t)); +mp_size_t +mpn_hgcd __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, + struct hgcd_matrix *, mp_ptr)); -#if WANT_ASSERT -void -__gmpn_hgcd_sanity __GMP_PROTO ((const struct hgcd *, - mp_srcptr, mp_size_t, - mp_srcptr, mp_size_t, - unsigned, unsigned)); -#define ASSERT_HGCD __gmpn_hgcd_sanity -#else -#define ASSERT_HGCD(hgcd, ap, asize, bp, bsize, start, end) -#endif +#define MPN_HGCD_LEHMER_ITCH(n) (n) -int -mpn_hgcd_equal __GMP_PROTO ((const struct hgcd *, const struct hgcd *)); +mp_size_t +mpn_hgcd_lehmer __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, + struct hgcd_matrix *, mp_ptr)); + +/* Needs storage for the quotient */ +#define MPN_GCD_SUBDIV_STEP_ITCH(n) (n) + +mp_size_t +mpn_gcd_subdiv_step __GMP_PROTO ((mp_ptr, mp_size_t *, + mp_ptr, mp_ptr, mp_size_t, mp_ptr)); + +#define MPN_GCD_LEHMER_N_ITCH(n) (n) + +mp_size_t +mpn_gcd_lehmer_n __GMP_PROTO ((mp_ptr, mp_ptr, mp_ptr, mp_size_t, + mp_ptr)); + +/* To calculate the needed scratch space, n should be a bound for both + input and output sizes. 
*/ +#define MPN_GCDEXT_SUBDIV_ITCH(n) (2*(n) + 1) + +mp_size_t +mpn_gcdext_subdiv_step __GMP_PROTO ((mp_ptr, mp_size_t *, mp_ptr, mp_size_t *, + mp_ptr, mp_ptr, mp_size_t, + mp_ptr, mp_ptr, mp_size_t *, mp_ptr)); + +#define MPN_GCDEXT_LEHMER_N_ITCH(n) (4*(n) + 3) mp_size_t -mpn_hgcd_fix __GMP_PROTO ((mp_size_t, - mp_ptr, mp_size_t, - int, mp_size_t, - const struct hgcd_row *, - mp_srcptr, mp_srcptr, - mp_ptr, mp_size_t)); +mpn_gcdext_lehmer_n __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, + mp_ptr, mp_ptr, mp_size_t, + mp_ptr)); + +/* 4*(an + 1) + 4*(bn + 1) + an */ +#define MPN_GCDEXT_LEHMER_ITCH(an, bn) (5*(an) + 4*(bn) + 8) -#ifndef HGCD_SCHOENHAGE_THRESHOLD -#define HGCD_SCHOENHAGE_THRESHOLD 150 +#ifndef HGCD_THRESHOLD +#define HGCD_THRESHOLD 400 #endif #if 0 @@ -3617,12 +3599,12 @@ mpn_hgcd_fix __GMP_PROTO ((mp_size_t, #endif #endif -#ifndef GCD_SCHOENHAGE_THRESHOLD -#define GCD_SCHOENHAGE_THRESHOLD 1000 +#ifndef GCD_DC_THRESHOLD +#define GCD_DC_THRESHOLD 1000 #endif -#ifndef GCDEXT_SCHOENHAGE_THRESHOLD -#define GCDEXT_SCHOENHAGE_THRESHOLD 600 +#ifndef GCDEXT_DC_THRESHOLD +#define GCDEXT_DC_THRESHOLD 600 #endif /* Definitions for mpn_set_str and mpn_get_str */ @@ -4044,9 +4026,13 @@ extern mp_size_t div_dc_threshold; #define POWM_THRESHOLD powm_threshold extern mp_size_t powm_threshold; -#undef HGCD_SCHOENHAGE_THRESHOLD -#define HGCD_SCHOENHAGE_THRESHOLD hgcd_schoenhage_threshold -extern mp_size_t hgcd_schoenhage_threshold; +#undef MATRIX22_STRASSEN_THRESHOLD +#define MATRIX22_STRASSEN_THRESHOLD matrix22_strassen_threshold +extern mp_size_t matrix22_strassen_threshold; + +#undef HGCD_THRESHOLD +#define HGCD_THRESHOLD hgcd_threshold +extern mp_size_t hgcd_threshold; #undef GCD_ACCEL_THRESHOLD #define GCD_ACCEL_THRESHOLD gcd_accel_threshold @@ -4058,13 +4044,13 @@ extern mp_size_t gcd_accel_threshold; extern mp_size_t gcd_lehmer_threshold; #endif -#undef GCD_SCHOENHAGE_THRESHOLD -#define GCD_SCHOENHAGE_THRESHOLD gcd_schoenhage_threshold -extern mp_size_t 
gcd_schoenhage_threshold; +#undef GCD_DC_THRESHOLD +#define GCD_DC_THRESHOLD gcd_dc_threshold +extern mp_size_t gcd_dc_threshold; -#undef GCDEXT_SCHOENHAGE_THRESHOLD -#define GCDEXT_SCHOENHAGE_THRESHOLD gcdext_schoenhage_threshold -extern mp_size_t gcdext_schoenhage_threshold; +#undef GCDEXT_DC_THRESHOLD +#define GCDEXT_DC_THRESHOLD gcdext_dc_threshold +extern mp_size_t gcdext_dc_threshold; #undef DIVREM_1_NORM_THRESHOLD #define DIVREM_1_NORM_THRESHOLD divrem_1_norm_threshold diff --git a/mpn/Makefile.am b/mpn/Makefile.am index 78f88e24c..d883ec2b8 100644 --- a/mpn/Makefile.am +++ b/mpn/Makefile.am @@ -40,7 +40,8 @@ nodist_EXTRA_libmpn_la_SOURCES = \ dump.c fib2_ui.c gcd.c \ gcd_finda.c gcd_1.c gcdext.c get_d.c get_str.c \ hamdist.c hgcd2.c hgcd.c invert_limb.c \ - ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \ + ior_n.c iorn_n.c jacbase.c lshift.c \ + matrix22_mul.c mod_1.c mod_34lsub1.c mode1o.c \ mul.c mul_1.c mul_2.c mul_3.c mul_4.c mul_fft.c mul_n.c mul_basecase.c \ mul_toom22.c mul_toom32.c mul_toom42.c \ mullow_n.c mullow_basecase.c nand_n.c nior_n.c perfsqr.c popcount.c \ @@ -72,5 +73,7 @@ mp_bases.c: perfsqr.h: cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h +tune-gcd-p: gcd.c + $(COMPILE) -DTUNE_GCD_P=1 gcd.c -o tune-gcd-p -L ../.libs -lgmp include Makeasm.am diff --git a/mpn/alpha/ev5/gmp-mparam.h b/mpn/alpha/ev5/gmp-mparam.h index a58805781..9de9c07a2 100644 --- a/mpn/alpha/ev5/gmp-mparam.h +++ b/mpn/alpha/ev5/gmp-mparam.h @@ -41,10 +41,10 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 46 #define POWM_THRESHOLD 87 -#define HGCD_SCHOENHAGE_THRESHOLD 97 +#define HGCD_THRESHOLD 106 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 566 -#define GCDEXT_SCHOENHAGE_THRESHOLD 322 +#define GCD_DC_THRESHOLD 622 +#define GCDEXT_SCHOENHAGE_THRESHOLD 293 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ diff --git a/mpn/alpha/ev6/gmp-mparam.h b/mpn/alpha/ev6/gmp-mparam.h index 33ea80a54..f259a2278 100644 --- a/mpn/alpha/ev6/gmp-mparam.h +++ b/mpn/alpha/ev6/gmp-mparam.h @@ -42,9 +42,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 116 #define POWM_THRESHOLD 212 -#define HGCD_SCHOENHAGE_THRESHOLD 407 +#define HGCD_THRESHOLD 407 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 867 +#define GCD_DC_THRESHOLD 867 #define GCDEXT_SCHOENHAGE_THRESHOLD 867 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/alpha/ev6/nails/gmp-mparam.h b/mpn/alpha/ev6/nails/gmp-mparam.h index 5d884e3bb..1bc93b52c 100644 --- a/mpn/alpha/ev6/nails/gmp-mparam.h +++ b/mpn/alpha/ev6/nails/gmp-mparam.h @@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 48 #define POWM_THRESHOLD 113 -#define HGCD_SCHOENHAGE_THRESHOLD 78 +#define HGCD_THRESHOLD 78 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 392 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 392 #define JACOBI_BASE_METHOD 1 #define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ diff --git a/mpn/alpha/gmp-mparam.h b/mpn/alpha/gmp-mparam.h index 138cc5438..37f700494 100644 --- a/mpn/alpha/gmp-mparam.h +++ b/mpn/alpha/gmp-mparam.h @@ -41,9 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 38 #define POWM_THRESHOLD 53 -#define HGCD_SCHOENHAGE_THRESHOLD 63 +#define HGCD_THRESHOLD 63 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 476 +#define GCD_DC_THRESHOLD 476 #define GCDEXT_SCHOENHAGE_THRESHOLD 225 #define JACOBI_BASE_METHOD 2 diff --git a/mpn/arm/gmp-mparam.h b/mpn/arm/gmp-mparam.h index a142605fb..80b6ff8ee 100644 --- a/mpn/arm/gmp-mparam.h +++ b/mpn/arm/gmp-mparam.h @@ -37,7 +37,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 150 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 0 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ diff --git a/mpn/cray/gmp-mparam.h b/mpn/cray/gmp-mparam.h index b7da45c43..72dcb627d 100644 --- a/mpn/cray/gmp-mparam.h +++ b/mpn/cray/gmp-mparam.h @@ -41,10 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 996 #define POWM_THRESHOLD 601 -#define HGCD_SCHOENHAGE_THRESHOLD 964 +#define HGCD_THRESHOLD 964 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 2874 -#define GCDEXT_THRESHOLD 6 +#define GCD_DC_THRESHOLD 2874 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ diff --git a/mpn/cray/ieee/gmp-mparam.h b/mpn/cray/ieee/gmp-mparam.h index d5a866000..03d655c81 100644 --- a/mpn/cray/ieee/gmp-mparam.h +++ b/mpn/cray/ieee/gmp-mparam.h @@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 390 #define POWM_THRESHOLD 656 -#define HGCD_SCHOENHAGE_THRESHOLD 964 +#define HGCD_THRESHOLD 964 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 964 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 964 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ diff --git a/mpn/generic/gcd.c b/mpn/generic/gcd.c index 30d6969a3..786c328f3 100644 --- a/mpn/generic/gcd.c +++ b/mpn/generic/gcd.c @@ -18,852 +18,255 @@ License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ -/* Integer greatest common divisor of two unsigned integers, using - the accelerated algorithm (see reference below). - - mp_size_t mpn_gcd (up, usize, vp, vsize). - - Preconditions [U = (up, usize) and V = (vp, vsize)]: - - 1. V is odd. - 2. numbits(U) >= numbits(V). - - Both U and V are destroyed by the operation. The result is left at vp, - and its size is returned. - - Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) - - Funding for this work has been partially provided by Conselho Nacional - de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant - 301314194-2, and was done while I was a visiting reseacher in the Instituto - de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). - - Refer to - K. Weber, The accelerated integer GCD algorithm, ACM Transactions on - Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ - -#include <stdio.h> /* for NULL */ - #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" - -/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated - algorithm is used, otherwise the binary algorithm is used. This may be - adjusted for different architectures. 
*/ -#ifndef GCD_ACCEL_THRESHOLD -#define GCD_ACCEL_THRESHOLD 5 -#endif - -/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated - algorithm reduces using the bmod operation. Otherwise, the k-ary reduction - is used. 0 <= BMOD_THRESHOLD < GMP_NUMB_BITS. */ -enum - { - BMOD_THRESHOLD = GMP_NUMB_BITS/2 - }; - - -/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2. - Both U and V must be odd. */ -static inline mp_size_t -gcd_2 (mp_ptr vp, mp_srcptr up) +static inline int +mpn_zero_p (mp_srcptr ap, mp_size_t n) { - mp_limb_t u0, u1, v0, v1; - mp_size_t vsize; - - u0 = up[0]; - u1 = up[1]; - v0 = vp[0]; - v1 = vp[1]; - - while (u1 != v1 && u0 != v0) + mp_size_t i; + for (i = n - 1; i >= 0; i--) { - unsigned long int r; - if (u1 > v1) - { - u1 -= v1 + (u0 < v0); - u0 = (u0 - v0) & GMP_NUMB_MASK; - count_trailing_zeros (r, u0); - u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r); - u1 >>= r; - } - else /* u1 < v1. */ - { - v1 -= u1 + (v0 < u0); - v0 = (v0 - u0) & GMP_NUMB_MASK; - count_trailing_zeros (r, v0); - v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r); - v1 >>= r; - } + if (ap[i] != 0) + return 0; } - - vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0); - - /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */ - if (u1 == v1 && u0 == v0) - return vsize; - - v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0; - vp[0] = mpn_gcd_1 (vp, vsize, v0); - return 1; } -/* The function find_a finds 0 < N < 2^GMP_NUMB_BITS such that there exists - 0 < |D| < 2^GMP_NUMB_BITS, and N == D * C mod 2^(2*GMP_NUMB_BITS). - In the reference article, D was computed along with N, but it is better to - compute D separately as D <-- N / C mod 2^(GMP_NUMB_BITS + 1), treating - the result as a twos' complement signed integer. - - Initialize N1 to C mod 2^(2*GMP_NUMB_BITS). 
According to the reference - article, N2 should be initialized to 2^(2*GMP_NUMB_BITS), but we use - 2^(2*GMP_NUMB_BITS) - N1 to start the calculations within double - precision. If N2 > N1 initially, the first iteration of the while loop - will swap them. In all other situations, N1 >= N2 is maintained. */ - -#if HAVE_NATIVE_mpn_gcd_finda -#define find_a(cp) mpn_gcd_finda (cp) +/* Uses the HGCD operation described in + + N. Möller, On Schönhage's algorithm and subquadratic integer gcd + computation, Math. Comp. 77 (2008), 589-607. + + to reduce inputs until they are of size below GCD_DC_THRESHOLD, and + then uses Lehmer's algorithm. +*/ + +/* Some reasonable choices are n / 2 (same as in hgcd), and p = (n + + * 2)/3, which gives a balanced multiplication in + * mpn_hgcd_matrix_adjust. However, p = 2 n/3 gives slightly better + * performance. The matrix-vector multiplication is then + * 4:1-unbalanced, with matrix elements of size n/6, and vector + * elements of size p = 2n/3. */ + +/* From analysis of the theoretical running time, it appears that when + * multiplication takes time O(n^alpha), p should be choosen so that + * the ratio of the time for the mpn_hgcd call, and the time for the + * multiplication in mpn_hgcd_matrix_adjust, is roughly 1/(alpha - + * 1). */ +#ifdef TUNE_GCD_P +#define P_TABLE_SIZE 10000 +mp_size_t p_table[P_TABLE_SIZE]; +#define CHOOSE_P(n) ( (n) < P_TABLE_SIZE ? p_table[n] : 2*(n)/3) #else -static -#if ! defined (__i386__) -inline /* don't inline this for the x86 */ +#define CHOOSE_P(n) (2*(n) / 3) #endif -mp_limb_t -find_a (mp_srcptr cp) -{ - unsigned long int leading_zero_bits = 0; - mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^GMP_NUMB_BITS + n1_l. */ - mp_limb_t n1_h = cp[1]; - - mp_limb_t n2_l = (-n1_l & GMP_NUMB_MASK); /* N2 == n2_h * 2^GMP_NUMB_BITS + n2_l. */ - mp_limb_t n2_h = (~n1_h & GMP_NUMB_MASK); - - /* Main loop. */ - while (n2_h != 0) /* While N2 >= 2^GMP_NUMB_BITS. */ - { - /* N1 <-- N1 % N2. 
*/ - if (((GMP_NUMB_HIGHBIT >> leading_zero_bits) & n2_h) == 0) - { - unsigned long int i; - count_leading_zeros (i, n2_h); - i -= GMP_NAIL_BITS; - i -= leading_zero_bits; - leading_zero_bits += i; - n2_h = ((n2_h << i) & GMP_NUMB_MASK) | (n2_l >> (GMP_NUMB_BITS - i)); - n2_l = (n2_l << i) & GMP_NUMB_MASK; - do - { - if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) - { - n1_h -= n2_h + (n1_l < n2_l); - n1_l = (n1_l - n2_l) & GMP_NUMB_MASK; - } - n2_l = (n2_l >> 1) | ((n2_h << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK); - n2_h >>= 1; - i -= 1; - } - while (i != 0); - } - if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) - { - n1_h -= n2_h + (n1_l < n2_l); - n1_l = (n1_l - n2_l) & GMP_NUMB_MASK; - } - - MP_LIMB_T_SWAP (n1_h, n2_h); - MP_LIMB_T_SWAP (n1_l, n2_l); - } - - return n2_l; -} -#endif - -/* v must be odd */ -static mp_size_t -gcd_binary_odd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +mp_size_t +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n) { - mp_ptr orig_vp = vp; - mp_size_t orig_vsize = vsize; - int binary_gcd_ctr; /* Number of times binary gcd will execute. */ + mp_size_t talloc; + mp_size_t scratch; + mp_size_t matrix_scratch; + + mp_size_t gn; + mp_ptr tp; TMP_DECL; - ASSERT (usize >= 1); - ASSERT (vsize >= 1); - ASSERT (usize >= vsize); - ASSERT (vp[0] & 1); - ASSERT (up[usize - 1] != 0); - ASSERT (vp[vsize - 1] != 0); -#if WANT_ASSERT - if (usize == vsize) + /* FIXME: Check for small sizes first, before setting up temporary + storage etc. 
*/ + talloc = MPN_GCD_LEHMER_N_ITCH(n); + + /* For initial division */ + scratch = usize - n + 1; + if (scratch > talloc) + talloc = scratch; + +#if TUNE_GCD_P + if (CHOOSE_P (n) > 0) +#else + if (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD)) +#endif { - int uzeros, vzeros; - count_leading_zeros (uzeros, up[usize - 1]); - count_leading_zeros (vzeros, vp[vsize - 1]); - ASSERT (uzeros <= vzeros); - } + mp_size_t hgcd_scratch; + mp_size_t update_scratch; + mp_size_t p = CHOOSE_P (n); + mp_size_t scratch; +#if TUNE_GCD_P + /* Worst case, since we don't guarantee that n - CHOOSE_P(n) + is increasing */ + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n); + hgcd_scratch = mpn_hgcd_itch (n); + update_scratch = 2*(n - 1); +#else + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + hgcd_scratch = mpn_hgcd_itch (n - p); + update_scratch = p + n - 1; #endif - ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize)); - ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, up, usize)); - ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, vp, vsize)); + scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); + if (scratch > talloc) + talloc = scratch; + } TMP_MARK; + tp = TMP_ALLOC_LIMBS(talloc); - /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD. - Two EXTRA limbs for U and V are required for kary reduction. */ - if (vsize >= GCD_ACCEL_THRESHOLD) + if (usize > n) { - unsigned long int vbitsize, d; - mp_ptr orig_up = up; - mp_size_t orig_usize = usize; - mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB); - - MPN_COPY (anchor_up, orig_up, usize); - up = anchor_up; - - count_leading_zeros (d, up[usize - 1]); - d -= GMP_NAIL_BITS; - d = usize * GMP_NUMB_BITS - d; - count_leading_zeros (vbitsize, vp[vsize - 1]); - vbitsize -= GMP_NAIL_BITS; - vbitsize = vsize * GMP_NUMB_BITS - vbitsize; - ASSERT (d >= vbitsize); - d = d - vbitsize + 1; - - /* Use bmod reduction to quickly discover whether V divides U. */ - up[usize++] = 0; /* Insert leading zero. 
*/ - mpn_bdivmod (up, up, usize, vp, vsize, d); - - /* Now skip U/V mod 2^d and any low zero limbs. */ - d /= GMP_NUMB_BITS, up += d, usize -= d; - while (usize != 0 && up[0] == 0) - up++, usize--; - - if (usize == 0) /* GCD == ORIG_V. */ - goto done; - - vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB); - MPN_COPY (vp, orig_vp, vsize); - - do /* Main loop. */ - { - /* mpn_com_n can't be used here because anchor_up and up may - partially overlap */ - if ((up[usize - 1] & GMP_NUMB_HIGHBIT) != 0) /* U < 0; take twos' compl. */ - { - mp_size_t i; - anchor_up[0] = -up[0] & GMP_NUMB_MASK; - for (i = 1; i < usize; i++) - anchor_up[i] = (~up[i] & GMP_NUMB_MASK); - up = anchor_up; - } - - MPN_NORMALIZE_NOT_ZERO (up, usize); - - if ((up[0] & 1) == 0) /* Result even; remove twos. */ - { - unsigned int r; - count_trailing_zeros (r, up[0]); - mpn_rshift (anchor_up, up, usize, r); - usize -= (anchor_up[usize - 1] == 0); - } - else if (anchor_up != up) - MPN_COPY_INCR (anchor_up, up, usize); - - MPN_PTR_SWAP (anchor_up,usize, vp,vsize); - up = anchor_up; - - if (vsize <= 2) /* Kary can't handle < 2 limbs and */ - break; /* isn't efficient for == 2 limbs. */ + mpn_tdiv_qr (tp, up, 0, up, usize, vp, n); - d = vbitsize; - count_leading_zeros (vbitsize, vp[vsize - 1]); - vbitsize -= GMP_NAIL_BITS; - vbitsize = vsize * GMP_NUMB_BITS - vbitsize; - d = d - vbitsize + 1; - - if (d > BMOD_THRESHOLD) /* Bmod reduction. */ - { - up[usize++] = 0; - mpn_bdivmod (up, up, usize, vp, vsize, d); - d /= GMP_NUMB_BITS, up += d, usize -= d; - } - else /* Kary reduction. */ - { - mp_limb_t bp[2], cp[2]; - - /* C <-- V/U mod 2^(2*GMP_NUMB_BITS). */ - { - mp_limb_t u_inv, hi, lo; - modlimb_invert (u_inv, up[0]); - cp[0] = (vp[0] * u_inv) & GMP_NUMB_MASK; - umul_ppmm (hi, lo, cp[0], up[0] << GMP_NAIL_BITS); - lo >>= GMP_NAIL_BITS; - cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv & GMP_NUMB_MASK; - } - - /* U <-- find_a (C) * U. 
*/ - up[usize] = mpn_mul_1 (up, up, usize, find_a (cp)); - usize++; - - /* B <-- A/C == U/V mod 2^(GMP_NUMB_BITS + 1). - bp[0] <-- U/V mod 2^GMP_NUMB_BITS and - bp[1] <-- ( (U - bp[0] * V)/2^GMP_NUMB_BITS ) / V mod 2 - - Like V/U above, but simplified because only the low bit of - bp[1] is wanted. */ - { - mp_limb_t v_inv, hi, lo; - modlimb_invert (v_inv, vp[0]); - bp[0] = (up[0] * v_inv) & GMP_NUMB_MASK; - umul_ppmm (hi, lo, bp[0], vp[0] << GMP_NAIL_BITS); - lo >>= GMP_NAIL_BITS; - bp[1] = (up[1] + hi + (bp[0] & vp[1])) & 1; - } - - up[usize++] = 0; - if (bp[1] != 0) /* B < 0: U <-- U + (-B) * V. */ - { - mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0] & GMP_NUMB_MASK); - mpn_add_1 (up + vsize, up + vsize, usize - vsize, c); - } - else /* B >= 0: U <-- U - B * V. */ - { - mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]); - mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); - } - - up += 2, usize -= 2; /* At least two low limbs are zero. */ - } - - /* Must remove low zero limbs before complementing. */ - while (usize != 0 && up[0] == 0) - up++, usize--; + if (mpn_zero_p (up, n)) + { + MPN_COPY (gp, vp, n); + TMP_FREE; + return n; } - while (usize != 0); - - /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */ - up = orig_up, usize = orig_usize; - binary_gcd_ctr = 2; } - else - binary_gcd_ctr = 1; - /* Finish up with the binary algorithm. Executes once or twice. */ - for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize) +#if TUNE_GCD_P + while (CHOOSE_P (n) > 0) +#else + while (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD)) +#endif { - if (usize > 2) /* First make U close to V in size. 
*/ + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P (n); + mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + mp_size_t nn; + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (up + p, vp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) { - unsigned long int vbitsize, d; - count_leading_zeros (d, up[usize - 1]); - d -= GMP_NAIL_BITS; - d = usize * GMP_NUMB_BITS - d; - count_leading_zeros (vbitsize, vp[vsize - 1]); - vbitsize -= GMP_NAIL_BITS; - vbitsize = vsize * GMP_NUMB_BITS - vbitsize; - d = d - vbitsize - 1; - if (d != -(unsigned long int)1 && d > 2) - { - mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */ - d /= (unsigned long int)GMP_NUMB_BITS, up += d, usize -= d; - } + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + /* Temporary storage 2 (p + M->n) <= p + n - 1. */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, up, vp, p, tp + matrix_scratch); } - - /* Start binary GCD. */ - do + else { - mp_size_t zeros; - - /* Make sure U is odd. */ - MPN_NORMALIZE (up, usize); - while (up[0] == 0) - up += 1, usize -= 1; - if ((up[0] & 1) == 0) - { - unsigned int r; - count_trailing_zeros (r, up[0]); - mpn_rshift (up, up, usize, r); - usize -= (up[usize - 1] == 0); - } - - /* Keep usize >= vsize. */ - if (usize < vsize) - MPN_PTR_SWAP (up, usize, vp, vsize); - - if (usize <= 2) /* Double precision. */ - { - if (vsize == 1) - vp[0] = mpn_gcd_1 (up, usize, vp[0]); - else - vsize = gcd_2 (vp, up); - break; /* Binary GCD done. */ - } - - /* Count number of low zero limbs of U - V. */ - for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; ) - continue; - - /* If U < V, swap U and V; in any case, subtract V from U. */ - if (zeros == vsize) /* Subtract done. 
*/ - up += zeros, usize -= zeros; - else if (usize == vsize) + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (gp, &gn, up, vp, n, tp); + if (n == 0) { - mp_size_t size = vsize; - do - size--; - while (up[size] == vp[size]); - if (up[size] < vp[size]) /* usize == vsize. */ - MP_PTR_SWAP (up, vp); - up += zeros, usize = size + 1 - zeros; - mpn_sub_n (up, up, vp + zeros, usize); - } - else - { - mp_size_t size = vsize - zeros; - up += zeros, usize -= zeros; - if (mpn_sub_n (up, up, vp + zeros, size)) - { - while (up[size] == 0) /* Propagate borrow. */ - up[size++] = -(mp_limb_t)1; - up[size] -= 1; - } + TMP_FREE; + return gn; } } - while (usize); /* End binary GCD. */ } -done: - if (vp != gp) - MPN_COPY_INCR (gp, vp, vsize); + gn = mpn_gcd_lehmer_n (gp, up, vp, n, tp); TMP_FREE; - return vsize; + return gn; } -#define EVEN_P(x) (((x) & 1) == 0) - -/* Allows an even v */ -static mp_size_t -gcd_binary (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +#ifdef TUNE_GCD_P +#include <stdio.h> +#include <string.h> +#include <time.h> + +#define TIME(res, code) do { \ + clock_t time_start; \ + clock_t time_end; \ + clock_t time_end_time; \ + unsigned time_iter = 0; \ + \ + time_start = clock(); \ + time_end_time = time_start + CLOCKS_PER_SEC / 100; \ + do \ + { \ + code; \ + time_end = clock(); \ + time_iter++; \ + } \ + while (time_end <= time_end_time); \ + \ + (res) = (double) (time_end - time_start) / (CLOCKS_PER_SEC * time_iter); \ + } while (0) + +int +main(int argc, char *argv) { - mp_size_t zero_words = 0; - mp_size_t gsize; - unsigned shift = 0; - - ASSERT (usize > 0); - ASSERT (vsize > 0); + gmp_randstate_t rands; + mp_size_t n; + mp_ptr ap; + mp_ptr bp; + mp_ptr up; + mp_ptr vp; + mp_ptr gp; + mp_ptr tp; + TMP_DECL; - if (up[0] == 0 && vp[0] == 0) - { - do - gp[zero_words++] = 0; - while (up[zero_words] == 0 && vp[zero_words] == 0); + /* Unbuffered so if output is redirected to a file it isn't lost if the + program is killed part way through. 
*/ + setbuf (stdout, NULL); + setbuf (stderr, NULL); - up += zero_words; usize -= zero_words; - vp += zero_words; vsize -= zero_words; - gp += zero_words; - } + gmp_randinit_default (rands); - /* Now u and v can have a common power of two < 2^GMP_NUMB_BITS */ - if (up[0] == 0) - { - ASSERT (vp[0] != 0); - if (EVEN_P (vp[0])) - { - count_trailing_zeros (shift, vp[0]); - ASSERT (shift > 0); - ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, shift)); - if (vp[vsize - 1] == 0) - vsize--; - } - } - else if (vp[0] == 0) - { - if (EVEN_P (up[0])) - { - count_trailing_zeros (shift, up[0]); - ASSERT (shift > 0); - } - while (vp[0] == 0) - { - vp++; - vsize--; - } - - if (EVEN_P (vp[0])) - { - unsigned vcount; - - count_trailing_zeros (vcount, vp[0]); - ASSERT (vcount > 0); - ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount)); - if (vp[vsize - 1] == 0) - vsize--; - } - } - else if (EVEN_P (vp[0])) - { - unsigned vcount; - count_trailing_zeros (vcount, vp[0]); - ASSERT (vcount > 0); - ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount)); - if (vp[vsize - 1] == 0) - vsize--; - - if (EVEN_P (up[0])) - { - unsigned ucount; - count_trailing_zeros (ucount, up[0]); - ASSERT (ucount > 0); - shift = MIN (ucount, vcount); - } - } + TMP_MARK; - gsize = gcd_binary_odd (gp, up, usize, vp, vsize); - if (shift) + ap = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + bp = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + up = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + vp = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + gp = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + tp = TMP_ALLOC_LIMBS (MPN_GCD_LEHMER_N_ITCH (P_TABLE_SIZE)); + + mpn_random (ap, P_TABLE_SIZE); + mpn_random (bp, P_TABLE_SIZE); + + memset (p_table, 0, sizeof(p_table)); + + for (n = 10; n++; n < P_TABLE_SIZE) { - mp_limb_t cy = mpn_lshift (gp, gp, gsize, shift); - if (cy) - gp[gsize++] = cy; - } - return gsize + zero_words; -} - -#define MPN_LEQ_P(ap, asize, bp, bsize) \ -((asize) < (bsize) || ((asize) == (bsize) \ - && mpn_cmp ((ap), (bp), (asize)) <= 0)) - -/* Sets (a, b, c, d) <-- (c, d, a, 
b) */ -#define NHGCD_SWAP4_2(row) \ -do { \ - struct hgcd_row __nhgcd_swap4_2_tmp; \ - __nhgcd_swap4_2_tmp = row[0]; \ - row[0] = row[2]; \ - row[2] = __nhgcd_swap4_2_tmp; \ - __nhgcd_swap4_2_tmp = row[1]; \ - row[1] = row[3]; \ - row[3] = __nhgcd_swap4_2_tmp; \ -} while (0) - -/* Sets (a, b, c) <-- (b, c, a) */ -#define NHGCD_SWAP3_LEFT(row) \ -do { \ - struct hgcd_row __nhgcd_swap4_left_tmp; \ - __nhgcd_swap4_left_tmp = row[0]; \ - row[0] = row[1]; \ - row[1] = row[2]; \ - row[2] = __nhgcd_swap4_left_tmp; \ -} while (0) - -static mp_size_t -hgcd_tdiv (mp_ptr qp, - mp_ptr rp, mp_size_t *rsizep, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize) -{ - mp_size_t qsize; - mp_size_t rsize; + mp_size_t p; + mp_size_t best_p; + double best_time; + double lehmer_time; - mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize); + if (ap[n-1] == 0) + ap[n-1] = 1; - rsize = bsize; - MPN_NORMALIZE (rp, rsize); - *rsizep = rsize; + if (bp[n-1] == 0) + bp[n-1] = 1; - qsize = asize - bsize + 1; - qsize -= (qp[qsize - 1] == 0); + p_table[n] = 0; + TIME(lehmer_time, { + MPN_COPY (up, ap, n); + MPN_COPY (vp, bp, n); + mpn_gcd_lehmer_n (gp, up, vp, n, tp); + }); - if (qsize == 1 && qp[0] == 1) - return 0; - - return qsize; -} - - -#if 0 -#define GCD_LEHMER_ITCH(asize) (5*((asize) + 1)) - -static mp_size_t -gcd_lehmer (mp_ptr gp, mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_ptr tp, mp_size_t talloc) -{ - struct hgcd_row r[4]; - mp_ptr qp; - mp_size_t qsize; - mp_size_t ralloc = asize + 1; + best_time = lehmer_time; + best_p = 0; - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - -#if 0 - if (BELOW_THRESHOLD (asize, MPN_GCD_LEHMER_THRESHOLD)) - { - ASSERT (asize + bsize + 2 <= talloc); - - MPN_COPY (tp, ap, asize); - MPN_COPY (tp + asize + 1, bp, bsize); - return nhgcd_gcd_binary (gp, tp, asize, tp + asize + 1, bsize); - } -#endif - - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); - ASSERT (5 * asize + 4 <= talloc); - - r[0].rp = tp; tp += ralloc; talloc -= 
ralloc; - r[1].rp = tp; tp += ralloc; talloc -= ralloc; - r[2].rp = tp; tp += ralloc; talloc -= ralloc; - r[3].rp = tp; tp += ralloc; talloc -= ralloc; - qp = tp; tp += asize; talloc -= asize; - - MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; - MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; - -#if 0 - /* u and v fields aren't used, but zero them out so that we can call - trace_nhgcd_row */ - r[0].uvp[0] = r[0].uvp[1] = NULL; - r[1].uvp[0] = r[1].uvp[1] = NULL; - r[2].uvp[0] = r[2].uvp[1] = NULL; - r[3].uvp[0] = r[3].uvp[1] = NULL; -#endif - - while (ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD) && r[1].rsize > 0) - { - struct hgcd2 hgcd; - int res = mpn_hgcd2_lehmer_step (&hgcd, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize, - NULL); - - if (!res || (res == 2 && hgcd.row[0].v == 0)) + for (p = 1; p < n; p += (n+9)/10) { - qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); - NHGCD_SWAP3_LEFT (r); - } - else - { - const struct hgcd2_row *s = hgcd.row + (res - 2); - int sign = hgcd.sign; - if (res == 3) - sign = ~sign; - - /* s[0] and s[1] correct. 
*/ - r[2].rsize - = mpn_hgcd2_fix (r[2].rp, ralloc, - sign, - s[0].u, r[0].rp, r[0].rsize, - s[0].v, r[1].rp, r[1].rsize); - - r[3].rsize - = mpn_hgcd2_fix (r[3].rp, ralloc, - ~sign, - s[1].u, r[0].rp, r[0].rsize, - s[1].v, r[1].rp, r[1].rsize); - - NHGCD_SWAP4_2 (r); - } - } - - if (r[1].rsize == 0) - { - MPN_COPY (gp, r[0].rp, r[0].rsize); - return r[0].rsize; - } + double t; - return gcd_binary (gp, r[0].rp, r[0].rsize, r[1].rp, r[1].rsize); -} -#endif - -static mp_size_t -gcd_schoenhage_itch (mp_size_t asize) -{ - /* Size for hgcd calls */ - mp_size_t ralloc = asize + 1; - mp_size_t hgcd_size = (asize + 1) / 2; - return (4 * ralloc /* Remainder storage */ - + mpn_hgcd_init_itch (hgcd_size) /* hgcd storage */ - + qstack_itch (hgcd_size) - + mpn_hgcd_itch (hgcd_size) /* nhgcd call */ - + 1+ 3 * asize / 4); /* hgcd_fix */ -} + p_table[n] = p; + TIME(t, { + MPN_COPY (up, ap, n); + MPN_COPY (vp, bp, n); + mpn_gcd (gp, up, n, vp, n); + }); -static mp_size_t -gcd_schoenhage (mp_ptr gp, mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_ptr tp, mp_size_t talloc) -{ - mp_size_t scratch; - struct hgcd hgcd; - struct qstack quotients; - struct hgcd_row r[4]; - - mp_size_t ralloc = asize + 1; - - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); - - ASSERT (4 * ralloc <= talloc); - tp += ralloc; talloc -= ralloc; - r[0].rp = tp; tp += ralloc; talloc -= ralloc; - r[1].rp = tp; tp += ralloc; talloc -= ralloc; - r[2].rp = tp; tp += ralloc; talloc -= ralloc; - r[3].rp = tp; tp += ralloc; talloc -= ralloc; - - MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; - MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; - -#if 0 - /* We don't use the u and v fields, but zero them out so that we can - call trace_nhgcd_row while debugging. 
*/ - r[0].uvp[0] = r[0].uvp[1] = NULL; - r[1].uvp[0] = r[1].uvp[1] = NULL; - r[2].uvp[0] = r[2].uvp[1] = NULL; - r[3].uvp[0] = r[3].uvp[1] = NULL; -#endif - - scratch = mpn_hgcd_init_itch ((asize + 1)/2); - ASSERT (scratch <= talloc); - mpn_hgcd_init (&hgcd, (asize + 1)/2, tp); - tp += scratch; talloc -= scratch; - - { - mp_size_t nlimbs = qstack_itch ((asize + 1)/2); - - ASSERT (nlimbs <= talloc); - - qstack_init ("ients, (asize + 1) / 2, tp, nlimbs); - - tp += nlimbs; - talloc -= nlimbs; - } - - while (ABOVE_THRESHOLD (r[0].rsize, GCD_SCHOENHAGE_THRESHOLD) - && r[1].rsize > 0) - { - mp_size_t k = r[0].rsize / 2; - int res; - -#if 0 - trace ("nhgcd_gcd_schoenhage\n"); - trace_nhgcd_row (r); - trace_nhgcd_row (r + 1); -#endif - if (r[1].rsize <= k) - goto euclid; - - qstack_reset ("ients, r[0].rsize - k); - - res = mpn_hgcd (&hgcd, - r[0].rp + k, r[0].rsize - k, - r[1].rp + k, r[1].rsize - k, - "ients, - tp, talloc); - - if (res == 0 || res == 1) - { - euclid: - ASSERT (r[0].rsize - r[1].rsize + 1 <= talloc); - hgcd_tdiv (tp, r[2].rp, &r[2].rsize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); - - NHGCD_SWAP3_LEFT (r); - } - else - { - const struct hgcd_row *s = hgcd.row + (res - 2); - int sign = hgcd.sign; - if (res == 3) - sign = ~sign; - - /* s[0] and s[1] are correct */ - r[2].rsize - = mpn_hgcd_fix (k, r[2].rp, ralloc, - sign, hgcd.size, s, - r[0].rp, r[1].rp, - tp, talloc); - - r[3].rsize - = mpn_hgcd_fix (k, r[3].rp, ralloc, - ~sign, hgcd.size, s+1, - r[0].rp, r[1].rp, - tp, talloc); - - NHGCD_SWAP4_2 (r); + if (t < best_time) + { + best_time = t; + best_p = p; + } } - } + printf("%6d %6d %5.3g", n, best_p, (double) best_p / n); + if (best_p > 0) + printf(" %5.3g%%", 100 * (lehmer_time - best_time) / lehmer_time); + printf("\n"); -#if 0 - trace ("nhgcd_gcd_schoenhage after loop\n"); - trace_nhgcd_row (r); - trace_nhgcd_row (r + 1); -#endif - - if (r[1].rsize == 0) - { - MPN_COPY (gp, r[0].rp, r[0].rsize); - return r[0].rsize; - } -#if 0 - else if 
(ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD)) - return gcd_lehmer (gp, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize, - tp, talloc); -#endif - else - return gcd_binary (gp, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); -} - -/* Should we perform an initial division? */ -mp_size_t -mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) -{ - if (BELOW_THRESHOLD (usize, GCD_SCHOENHAGE_THRESHOLD)) - return gcd_binary_odd (gp, up, usize, vp, vsize); - - /* The algorithms below require U >= V, while mpn_gcd is long documented as - requiring only that the position of U's msb >= V's msb. */ - if (usize == vsize && mpn_cmp (up, vp, usize) < 0) - MP_PTR_SWAP (up, vp); - -#if 0 - if (BELOW_THRESHOLD (usize, GCD_SCHOENHAGE_THRESHOLD)) - { - mp_size_t scratch; - mp_ptr tp; - mp_size_t gsize; - TMP_DECL; - - TMP_MARK; - - scratch = GCD_LEHMER_ITCH (usize); - tp = TMP_ALLOC_LIMBS (scratch); - - gsize = gcd_lehmer (gp, up, usize, vp, vsize, tp, scratch); - TMP_FREE; - return gsize; - } - else -#endif - { - mp_size_t scratch; - mp_ptr tp; - mp_size_t gsize; - - scratch = gcd_schoenhage_itch (usize); - tp = __GMP_ALLOCATE_FUNC_LIMBS (scratch); - - gsize = gcd_schoenhage (gp, up, usize, vp, vsize, tp, scratch); - __GMP_FREE_FUNC_LIMBS (tp, scratch); - return gsize; + p_table[n] = best_p; } + TMP_FREE; + gmp_randclear(rands); + return 0; } +#endif /* TUNE_GCD_P */ diff --git a/mpn/generic/gcd_lehmer.c b/mpn/generic/gcd_lehmer.c new file mode 100644 index 000000000..42a7ddefc --- /dev/null +++ b/mpn/generic/gcd_lehmer.c @@ -0,0 +1,161 @@ +/* gcd_lehmer.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Use binary algorithm to compute G <-- GCD (U, V) for usize, vsize == 2. + Both U and V must be odd. */ +static inline mp_size_t +gcd_2 (mp_ptr gp, mp_srcptr up, mp_srcptr vp) +{ + mp_limb_t u0, u1, v0, v1; + mp_size_t gn; + + u0 = up[0]; + u1 = up[1]; + v0 = vp[0]; + v1 = vp[1]; + + ASSERT (u0 & 1); + ASSERT (v0 & 1); + + /* Check for u0 != v0 needed to ensure that argument to + * count_trailing_zeros is non-zero. */ + while (u1 != v1 && u0 != v0) + { + unsigned long int r; + if (u1 > v1) + { + u1 -= v1 + (u0 < v0); + u0 = (u0 - v0) & GMP_NUMB_MASK; + count_trailing_zeros (r, u0); + u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r); + u1 >>= r; + } + else /* u1 < v1. */ + { + v1 -= u1 + (v0 < u0); + v0 = (v0 - u0) & GMP_NUMB_MASK; + count_trailing_zeros (r, v0); + v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r); + v1 >>= r; + } + } + + gp[0] = u0, gp[1] = u1, gn = 1 + (u1 != 0); + + /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */ + if (u1 == v1 && u0 == v0) + return gn; + + v0 = (u0 == v0) ? ((u1 > v1) ? u1-v1 : v1-u1) : ((u0 > v0) ? 
u0-v0 : v0-u0); + gp[0] = mpn_gcd_1 (gp, gn, v0); + + return 1; +} + +/* Temporary storage: n */ +mp_size_t +mpn_gcd_lehmer_n (mp_ptr gp, mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp) +{ + mp_size_t scratch; + + /* Relax this requirement, and normalize at the start? Must disallow + A = B = 0, though. */ + ASSERT(ap[n-1] > 0 || bp[n-1] > 0); + + while (n > 2) + { + struct hgcd_matrix1 M; + mp_limb_t ah, al, bh, bl; + mp_limb_t mask; + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_nhgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M)) + /* Temporary storage n */ + n = mpn_hgcd_mul_matrix1_inverse_vector (&M, n, ap, bp, tp); + + else + { + /* mpn_hgcd2 has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + mp_size_t gn; + + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (gp, &gn, ap, bp, n, tp); + if (n == 0) + return gn; + } + } + + if (n == 1) + { + *gp = mpn_gcd_1(ap, 1, bp[0]); + return 1; + } + + /* Due to the calling convention for mpn_gcd, at most one can be + even. */ + + if (! (ap[0] & 1)) + MP_PTR_SWAP (ap, bp); + + ASSERT (ap[0] & 1); + + if (bp[0] == 0) + { + *gp = mpn_gcd_1 (ap, 2, bp[1]); + return 1; + } + else if (! 
(bp[0] & 1)) + { + int r; + count_trailing_zeros (r, bp[0]); + bp[0] = ((bp[1] << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (bp[0] >> r); + bp[1] >>= r; + } + + return gcd_2(gp, ap, bp); +} diff --git a/mpn/generic/gcd_subdiv_step.c b/mpn/generic/gcd_subdiv_step.c new file mode 100644 index 000000000..d9708e8e1 --- /dev/null +++ b/mpn/generic/gcd_subdiv_step.c @@ -0,0 +1,116 @@ +/* gcd_subdiv_step.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +static inline int +mpn_zero_p (mp_srcptr ap, mp_size_t n) +{ + mp_size_t i; + for (i = n - 1; i >= 0; i--) + { + if (ap[i] != 0) + return 0; + } + return 1; +} + +/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or + b is small, or the difference is small. Perform one subtraction + followed by one division. If the gcd is found, stores it in gp and + *gn, and returns zero. Otherwise, compute the reduced a and b, and + return the new size. 
*/ + +/* FIXME: Check when the smaller number is a single limb, and invoke + * mpn_gcd_1. */ +mp_size_t +mpn_gcd_subdiv_step (mp_ptr gp, mp_size_t *gn, + mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp) +{ + mp_size_t an, bn; + + ASSERT (n > 0); + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); + + an = bn = n; + MPN_NORMALIZE (ap, an); + MPN_NORMALIZE (bp, bn); + + if (UNLIKELY (an == 0)) + { + return_b: + MPN_COPY (gp, bp, bn); + *gn = bn; + return 0; + } + else if (UNLIKELY (bn == 0)) + { + return_a: + MPN_COPY (gp, ap, an); + *gn = an; + return 0; + } + + /* Arrange so that a > b, subtract an -= bn, and maintain + normalization. */ + if (an < bn) + MPN_PTR_SWAP (ap, an, bp, bn); + else if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + goto return_a; + else if (c < 0) + MP_PTR_SWAP (ap, bp); + } + + ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn)); + MPN_NORMALIZE (ap, an); + ASSERT (an > 0); + + /* Arrange so that a > b, and divide a = q b + r */ + /* FIXME: an < bn happens when we have cancellation. If that is the + common case, then we could reverse the roles of a and b to avoid + the swap. */ + if (an < bn) + MPN_PTR_SWAP (ap, an, bp, bn); + else if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + goto return_a; + else if (c < 0) + MP_PTR_SWAP (ap, bp); + } + + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn); + + if (mpn_zero_p (ap, bn)) + goto return_b; + + return bn; +} diff --git a/mpn/generic/gcdext.c b/mpn/generic/gcdext.c index 63528f98e..94d490791 100644 --- a/mpn/generic/gcdext.c +++ b/mpn/generic/gcdext.c @@ -18,819 +18,101 @@ License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ -#define WANT_TRACE 0 - -/* Default to binary gcdext_1, since it is best on most current machines. - We should teach tuneup to choose the right gcdext_1. 
*/ -#define GCDEXT_1_USE_BINARY 1 - -#if WANT_TRACE -# include <stdio.h> -# include <stdarg.h> -#endif - #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" -#ifndef NULL -# define NULL ((void *) 0) -#endif - -#if WANT_TRACE -static void -trace (const char *format, ...) -{ - va_list args; - va_start (args, format); - gmp_vfprintf (stderr, format, args); - va_end (args); -} -#endif - -/* Comparison of _normalized_ numbers. */ - -#define MPN_EQUAL_P(ap, asize, bp, bsize) \ -((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0) - -#define MPN_LEQ_P(ap, asize, bp, bsize) \ -((asize) < (bsize) || ((asize) == (bsize) \ - && mpn_cmp ((ap), (bp), (asize)) <= 0)) - -/* Returns g, u and v such that g = u A - v B. There are three - different cases for the result: - - g = u A - v B, 0 < u < b, 0 < v < a - g = A u = 1, v = 0 - g = B u = B, v = A - 1 - - We always return with 0 < u <= b, 0 <= v < a. -*/ -#if GCDEXT_1_USE_BINARY - -static mp_limb_t -gcdext_1_odd (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) +static inline int +mpn_zero_p (mp_srcptr ap, mp_size_t n) { - mp_limb_t u0; - mp_limb_t v0; - mp_limb_t v1; - mp_limb_t u1; - - mp_limb_t B = b; - mp_limb_t A = a; - - /* Through out this function maintain - - a = u0 A - v0 B - b = u1 A - v1 B - - where A and B are odd. 
*/ - - u0 = 1; v0 = 0; - u1 = b; v1 = a-1; - - if (A == 1) - { - *up = u0; *vp = v0; - return 1; - } - else if (B == 1) - { - *up = u1; *vp = v1; - return 1; - } - - while (a != b) - { - mp_limb_t mask; - - ASSERT (a % 2 == 1); - ASSERT (b % 2 == 1); - - ASSERT (0 < u0); ASSERT (u0 <= B); - ASSERT (0 < u1); ASSERT (u1 <= B); - - ASSERT (0 <= v0); ASSERT (v0 < A); - ASSERT (0 <= v1); ASSERT (v1 < A); - - if (a > b) - { - MP_LIMB_T_SWAP (a, b); - MP_LIMB_T_SWAP (u0, u1); - MP_LIMB_T_SWAP (v0, v1); - } - - ASSERT (a < b); - - /* Makes b even */ - b -= a; - - mask = - (mp_limb_t) (u1 < u0); - u1 += B & mask; - v1 += A & mask; - u1 -= u0; - v1 -= v0; - - ASSERT (b % 2 == 0); - - do - { - /* As b = u1 A + v1 B is even, while A and B are odd, - either both or none of u1, v1 is even */ - - ASSERT (u1 % 2 == v1 % 2); - - mask = -(u1 & 1); - u1 = u1 / 2 + ((B / 2) & mask) - mask; - v1 = v1 / 2 + ((A / 2) & mask) - mask; - - b /= 2; - } - while (b % 2 == 0); - } - - /* Now g = a = b */ - ASSERT (a == b); - ASSERT (u1 <= B); - ASSERT (v1 < A); - - ASSERT (A % a == 0); - ASSERT (B % a == 0); - ASSERT (u0 % (B/a) == u1 % (B/a)); - ASSERT (v0 % (A/a) == v1 % (A/a)); - - *up = u0; *vp = v0; - - return a; -} - -static mp_limb_t -gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) -{ - unsigned shift = 0; - mp_limb_t g; - mp_limb_t u; - mp_limb_t v; - - /* We use unsigned values in the range 0, ... B - 1. As the values - are uniquely determined only modulo B, we can add B at will, to - get numbers in range or flip the least significant bit. */ - /* Deal with powers of two */ - while ((a | b) % 2 == 0) - { - a /= 2; b /= 2; shift++; - } - - if (b % 2 == 0) - { - unsigned k = 0; - - do { - b /= 2; k++; - } while (b % 2 == 0); - - g = gcdext_1_odd (&u, &v, a, b); - - while (k--) - { - /* We have g = u a + v b, and need to construct - g = u'a + v'(2b). 
- - If v is even, we can just set u' = u, v' = v/2 - If v is odd, we can set v' = (v + a)/2, u' = u + b - */ - - if (v % 2 == 0) - v /= 2; - else - { - u = u + b; - v = v/2 + a/2 + 1; - } - b *= 2; - } - } - else if (a % 2 == 0) - { - unsigned k = 0; - - do { - a /= 2; k++; - } while (a % 2 == 0); - - g = gcdext_1_odd (&u, &v, a, b); - - while (k--) - { - /* We have g = u a + v b, and need to construct - g = u'(2a) + v'b. - - If u is even, we can just set u' = u/2, v' = v. - If u is odd, we can set u' = (u + b)/2 - */ - - if (u % 2 == 0) - u /= 2; - else - { - u = u/2 + b/2 + 1; - v = v + a; - } - a *= 2; - } - } - else - /* Ok, both are odd */ - g = gcdext_1_odd (&u, &v, a, b); - - *up = u; - *vp = v; - - return g << shift; -} - -#else /* ! GCDEXT_1_USE_BINARY */ -static mp_limb_t -gcdext_1_u (mp_limb_t *up, mp_limb_t a, mp_limb_t b) -{ - /* Maintain - - a = u0 A mod B - b = - u1 A mod B - */ - mp_limb_t u0 = 1; - mp_limb_t u1 = 0; - mp_limb_t B = b; - - ASSERT (a >= b); - ASSERT (b > 0); - - for (;;) + mp_size_t i; + for (i = n - 1; i >= 0; i--) { - mp_limb_t q; - - q = a / b; - a -= q * b; - - if (a == 0) - { - *up = B - u1; - return b; - } - u0 += q * u1; - - q = b / a; - b -= q * a; - - if (b == 0) - { - *up = u0; - return a; - } - u1 += q * u0; + if (ap[i] != 0) + return 0; } + return 1; } -static mp_limb_t -gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) -{ - /* Maintain - - a = u0 A - v0 B - b = - u1 A + v1 B = (B - u1) A - (A - v1) B - */ - mp_limb_t u0 = 1; - mp_limb_t v0 = 0; - mp_limb_t u1 = 0; - mp_limb_t v1 = 1; - - mp_limb_t A = a; - mp_limb_t B = b; - - ASSERT (a >= b); - ASSERT (b > 0); - - for (;;) - { - mp_limb_t q; - - q = a / b; - a -= q * b; - - if (a == 0) - { - *up = B - u1; - *vp = A - v1; - return b; - } - u0 += q * u1; - v0 += q * v1; +/* Computes r = u0 x0 + u1 x1. Needs n = un + xn limbs of temporary + storage. 
Result is of size n-1, n or n+1, and the size is returned + (if inputs are non-normalized, result may be non-normalized too). - q = b / a; - b -= q * a; + No overlap between input and output is allowed, since rp is used + for temporary storage. */ - if (b == 0) - { - *up = u0; - *vp = v0; - return a; - } - u1 += q * u0; - v1 += q * v0; - } -} -#endif /* ! GCDEXT_1_USE_BINARY */ - -/* FIXME: Duplicated in gcd.c */ static mp_size_t -hgcd_tdiv (mp_ptr qp, - mp_ptr rp, mp_size_t *rsizep, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize) +addmul2_n (mp_ptr rp, + mp_srcptr u0, mp_srcptr u1, mp_size_t un, + mp_srcptr x0, mp_srcptr x1, mp_size_t xn, + mp_ptr tp) { - mp_size_t qsize; - mp_size_t rsize; - - mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize); - - rsize = bsize; - MPN_NORMALIZE (rp, rsize); - *rsizep = rsize; - - qsize = asize - bsize + 1; - qsize -= (qp[qsize - 1] == 0); - - if (qsize == 1 && qp[0] == 1) - return 0; - - return qsize; -} - -/* FIXME: Duplicated in hgcd.c */ -static mp_limb_t -mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n, - mp_ptr ap, mp_limb_t u, - mp_ptr bp, mp_limb_t v) -{ - mp_limb_t h; mp_limb_t cy; + mp_size_t n; - h = mpn_mul_1 (rp, ap, n, u); - cy = mpn_addmul_1 (rp, bp, n, v); - h += cy; -#if GMP_NAIL_BITS == 0 - rp[n] = h; - return (h < cy); -#else /* GMP_NAIL_BITS > 0 */ - rp[n] = h & GMP_NUMB_MASK; - return h >> GMP_NUMB_BITS; -#endif /* GMP_NAIL_BITS > 0 */ -} - - -/* Computes u2 = u0 + q u1 - - Returns new size. - - FIXME: Notation in the function not quite consistent - FIXME: Severe code duplication with hgcd_update_uv */ - -static mp_size_t -hgcd_update_u (struct hgcd_row *r, mp_size_t usize, - mp_srcptr qp, mp_size_t qsize, - /* Limbs allocated for the new u, for sanity - checking */ - mp_size_t alloc) -{ - mp_srcptr u0p = r[0].uvp[0]; - mp_srcptr u1p = r[1].uvp[0]; - mp_ptr u2p = r[2].uvp[0]; - - ASSERT (usize < alloc); - - /* u1 = 0 is an exceptional case. Except for this, u1 should be - normalized. 
*/ - - ASSERT ((usize == 1 && u1p[0] == 0) || u1p[usize - 1] != 0); - - /* Compute u2 = u0 + q u1 */ - - if (usize == 1 && u1p[0] == 0) - { - /* u1 == 0 is a special case, then q might be large, but it - doesn't matter. Can happen only when u0 = v1 = 1, u1 = v0 = - 0, and hence usize == 1. */ - MPN_COPY (u2p, u0p, usize); - } - else if (qsize == 0) - /* Represents a unit quotient */ - { - mp_limb_t cy = mpn_add_n (u2p, u0p, u1p, usize); - u2p[usize] = cy; - usize += (cy != 0); - } - else if (qsize == 1) - { - mp_limb_t cy; - - cy = mpn_mul_1 (u2p, u1p, usize, qp[0]); - cy += mpn_add_n (u2p, u2p, u0p, usize); - - u2p[usize] = cy; - usize += (cy != 0); - } - else - { - if (qsize <= usize) - mpn_mul (u2p, u1p, usize, qp, qsize); - else - mpn_mul (u2p, qp, qsize, u1p, usize); - - ASSERT_NOCARRY (mpn_add (u2p, - u2p, usize + qsize, - u0p, usize)); - - usize += qsize; - usize -= (u2p[usize - 1] == 0); - } - ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0); - ASSERT (r[2].uvp[0][usize - 1] != 0); - - return usize; -} - - -/* Computes Y = R * X. No overlap allowed. 
*/ -static mp_size_t -hgcd2_mul_vector (struct hgcd_row *Y, - mp_size_t alloc, - const struct hgcd2_row *R, - const struct hgcd_row *X, mp_size_t n) -{ - unsigned i; - int grow = 0; - mp_limb_t h = 0; - - ASSERT (n < alloc); - - for (i = 0; i < 2; i++) - { - /* Set Y[i] = R[i, 0] X[0] + R[i,1] X[1] - = u X[0] + v X[0] */ - mp_limb_t cy; - - cy = mpn_addmul2_n_1 (Y[i].uvp[0], n, - X[0].uvp[0], R[i].u, - X[1].uvp[0], R[i].v); - - if (cy) - { - ASSERT (n + 2 <= alloc); - Y[i].uvp[0][n+1] = cy; - grow = 1; - } - else - h |= Y[i].uvp[0][n]; - } - if (grow) - return n + 2; - else - /* Don't add redundant zeroes */ - return n + (h != 0); -} - -/* Sets (a, b, c) <-- (b, c, a) */ -#define HGCD_SWAP3_LEFT(row) \ -do { \ - struct hgcd_row __hgcd_swap4_left_tmp = row[0]; \ - row[0] = row[1]; \ - row[1] = row[2]; \ - row[2] = __hgcd_swap4_left_tmp; \ -} while (0) - -/* Sets (a, b, c, d) <-- (c, d, a, b) */ -#define HGCD_SWAP4_2(row) \ -do { \ - struct hgcd_row __hgcd_swap4_2_tmp = row[0]; \ - row[0] = row[2]; \ - row[2] = __hgcd_swap4_2_tmp; \ - __hgcd_swap4_2_tmp = row[1]; \ - row[1] = row[3]; \ - row[3] = __hgcd_swap4_2_tmp; \ -} while (0) - -static mp_size_t -gcdext_lehmer_itch (mp_size_t asize, mp_size_t bsize) -{ - mp_size_t ralloc = asize + 1; - mp_size_t ualloc = bsize + 1; - - return 4 * ralloc + 4 * ualloc + asize; -} - -static mp_size_t -gcdext_lehmer (mp_ptr gp, mp_ptr up, mp_size_t *usize, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_ptr tp, mp_size_t talloc) -{ - struct hgcd_row r[4]; - /* Size and sign of u fields. The largest u should be normalized to - this size, and except for the case u1 = 0, that is the latest - u. 
*/ - int rsize; - int rsign; - - mp_ptr qp; - mp_size_t qsize; - mp_size_t ralloc = asize + 1; - mp_size_t ualloc = bsize + 1; - - struct hgcd2 hgcd; - int res; - - ASSERT (asize >= bsize); - ASSERT (asize > 1); - ASSERT (bsize > 0); - - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); - - ASSERT (4 * ralloc + 4*ualloc + asize <= talloc); - - r[0].rp = tp; tp += ralloc; talloc -= ralloc; - r[1].rp = tp; tp += ralloc; talloc -= ralloc; - r[2].rp = tp; tp += ralloc; talloc -= ralloc; - r[3].rp = tp; tp += ralloc; talloc -= ralloc; - - /* Must zero out the u fields. We don't use the v fields. */ - MPN_ZERO (tp, 4 * ualloc); - - r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - - qp = tp; tp += asize; talloc -= asize; - - res = mpn_hgcd2_lehmer_step (&hgcd, - ap, asize, - bp, bsize, - NULL); - - if (res == 0 || (res == 2 && hgcd.row[0].v == 0)) + if (xn >= un) { - qsize = hgcd_tdiv (qp, r[1].rp, &r[1].rsize, - ap, asize, - bp, bsize); - MPN_COPY (r[0].rp, bp, bsize); - r[0].rsize = bsize; - - r[0].uvp[0][0] = 0; - r[1].uvp[0][0] = 1; - rsign = -1; + mpn_mul (rp, x0, xn, u0, un); + mpn_mul (tp, x1, xn, u1, un); } else { - const struct hgcd2_row *s = hgcd.row + (res - 2); - rsign = hgcd.sign; - if (res == 3) - rsign = ~rsign; - - /* s[0] and s[1] correct. 
*/ - r[0].rsize - = mpn_hgcd2_fix (r[0].rp, ralloc, - rsign, - s[0].u, ap, asize, - s[0].v, bp, bsize); - - r[1].rsize - = mpn_hgcd2_fix (r[1].rp, ralloc, - ~rsign, - s[1].u, ap, asize, - s[1].v, bp, bsize); - - r[0].uvp[0][0] = s[0].u; - r[1].uvp[0][0] = s[1].u; - } - rsize = 1; - - while (r[0].rsize >= 2 && r[1].rsize > 0) - { - res = mpn_hgcd2_lehmer_step (&hgcd, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize, - NULL); - - if (res == 0 || (res == 2 && hgcd.row[0].v == 0)) - { - qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); - rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc); - HGCD_SWAP3_LEFT (r); - rsign = ~rsign; - } - else - { - const struct hgcd2_row *s = hgcd.row + (res - 2); - int sign = hgcd.sign; - if (res == 3) - sign = ~sign; - - /* s[0] and s[1] correct. */ - r[2].rsize - = mpn_hgcd2_fix (r[2].rp, ralloc, - sign, - s[0].u, r[0].rp, r[0].rsize, - s[0].v, r[1].rp, r[1].rsize); - - r[3].rsize - = mpn_hgcd2_fix (r[3].rp, ralloc, - ~sign, - s[1].u, r[0].rp, r[0].rsize, - s[1].v, r[1].rp, r[1].rsize); - - rsize = hgcd2_mul_vector (r + 2, ralloc, s, r, rsize); - rsign ^= sign; - HGCD_SWAP4_2 (r); - } + mpn_mul (rp, u0, un, x0, xn); + mpn_mul (tp, u1, un, x1, xn); } - if (r[1].rsize == 0) - { - MPN_NORMALIZE (r[0].uvp[0], rsize); - MPN_COPY (gp, r[0].rp, r[0].rsize); - MPN_COPY (up, r[0].uvp[0], rsize); + n = un + xn; + cy = mpn_add_n (rp, rp, tp, n); - *usize = (rsign >= 0) ? rsize : -rsize; - return r[0].rsize; - } + if (cy > 0) + rp[n++] = cy; else - { - mp_limb_t cy; - mp_limb_t u; - mp_limb_t v; - - gp[0] = gcdext_1 (&u, &v, r[0].rp[0], r[1].rp[0]); - cy = mpn_addmul2_n_1 (up, rsize, - r[0].uvp[0], u, - r[1].uvp[0], v); - rsize++; - if (cy) - up[rsize++] = cy; - else - MPN_NORMALIZE (up, rsize); + MPN_NORMALIZE (rp, n); - *usize = (rsign >= 0) ? rsize : -rsize; - return 1; - } + return n; } -/* Computes Y = R * X. No overlap allowed. 
- - Temporary space is needed for two numbers smaller than the - resulting matrix elements, i.e. bounded by 2*L <= N. - - FIXME: Severe code duplication with hgcd.c: hgcd_mul. */ +#define COMPUTE_V_ITCH(n) (2*(n) + 1) +/* Computes |v| = |(g - u a)| / b, where u may be positive or + negative, and v is of the opposite sign. a, b are of size n, u and + v at most size n, and v must have space for n+1 limbs. */ static mp_size_t -hgcd_mul_vector (struct hgcd_row *Y, mp_size_t alloc, - const struct hgcd_row *R, mp_size_t rsize, - const struct hgcd_row *X, mp_size_t xsize, - mp_ptr tp, mp_size_t talloc) -{ - unsigned i; - - mp_size_t ysize; - mp_limb_t h; - int grow; - - MPN_NORMALIZE (R[1].uvp[1], rsize); - /* u1 = 0 is an exceptional case. Except for this, u1 should be - normalized. */ - ASSERT ((xsize == 1 && X[1].uvp[0][0] == 0) - || X[1].uvp[0][xsize - 1] != 0); - - if (xsize == 1 && X[1].uvp[0][0] == 0) - { - /* Special case. Set Y[i, 0] = R[i, 0] */ - ASSERT (X[0].uvp[0][0] == 1); - - if (rsize > 1) - MPN_NORMALIZE (R[1].uvp[0], rsize); - MPN_COPY (Y[0].uvp[0], R[0].uvp[0], rsize); - MPN_COPY (Y[1].uvp[0], R[1].uvp[0], rsize); - - return rsize; - } - - ysize = rsize + xsize; - ASSERT (ysize <= talloc); - - h = 0; grow = 0; - - if (rsize >= xsize) - { - for (i = 0; i < 2; i++) - { - /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */ - mp_limb_t cy; - - mpn_mul (Y[i].uvp[0], R[i].uvp[0], rsize, X[0].uvp[0], xsize); - mpn_mul (tp, R[i].uvp[1], rsize, X[1].uvp[0], xsize); - - cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize); - - if (cy) - { - ASSERT (ysize + 1 < alloc); - Y[i].uvp[0][ysize] = cy; - grow = 1; - } - else - h |= Y[i].uvp[0][ysize - 1]; - } - } - else - { - for (i = 0; i < 2; i++) - { - /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */ - mp_limb_t cy; - - mpn_mul (Y[i].uvp[0], X[0].uvp[0], xsize, R[i].uvp[0], rsize); - mpn_mul (tp, X[1].uvp[0], xsize, R[i].uvp[1], rsize); - - cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize); - - if (cy) - { - 
ASSERT (ysize + 1 < alloc); - Y[i].uvp[0][ysize] = cy; - grow = 1; - } - else - h |= Y[i].uvp[0][ysize - 1]; - } - } - - if (grow) - ysize++; - else - ysize -= (h == 0); - - ASSERT ((ysize == 1 && Y[1].uvp[0][0] == 0) || Y[1].uvp[0][ysize - 1] != 0); - - return ysize; -} - -#define COMPUTE_V_ITCH(asize, bsize, usize) \ - ((usize) + (asize) + 1 + (bsize)) - -/* Computes |v| = |(c - u a)| / b, where u may be positive or negative, - and v is of the opposite sign. Requires that b, c, |u| <= a. */ -static mp_size_t -compute_v (mp_ptr vp, mp_size_t valloc, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_srcptr cp, mp_size_t csize, +compute_v (mp_ptr vp, + mp_srcptr ap, mp_srcptr bp, mp_size_t n, + mp_srcptr gp, mp_size_t gn, mp_srcptr up, mp_size_t usize, - mp_ptr tp, mp_size_t talloc) + mp_ptr tp) { mp_size_t size; - mp_size_t vsize; - mp_ptr rp; - - ASSERT (asize); - ASSERT (bsize); - ASSERT (csize); - ASSERT (asize >= bsize); - -#if 0 - trace ("compute_v: a = %Nd\n" - " b = %Nd\n" - " c = %Nd\n" - " u = %Nd\n", - ap, asize, bp, bsize, cp, csize, up, usize); -#endif - - ASSERT (usize); - + mp_size_t an; + mp_size_t bn; + mp_size_t vn; + + ASSERT (n > 0); + ASSERT (gn > 0); + ASSERT (usize != 0); + size = ABS (usize); + ASSERT (size <= n); - ASSERT (size <= asize); - ASSERT (asize + size <= talloc); + an = n; + MPN_NORMALIZE (ap, an); - mpn_mul (tp, ap, asize, up, size); - size += asize; + if (an >= size) + mpn_mul (tp, ap, an, up, size); + else + mpn_mul (tp, up, size, ap, an); + + size += an; - ASSERT (csize <= size); + ASSERT (gn <= size); if (usize > 0) { - /* |v| = -v = (u a - c) / b */ + /* |v| = -v = (u a - g) / b */ - ASSERT_NOCARRY (mpn_sub (tp, tp, size, cp, csize)); + ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn)); MPN_NORMALIZE (tp, size); if (size == 0) return 0; @@ -838,495 +120,432 @@ compute_v (mp_ptr vp, mp_size_t valloc, else { /* usize < 0 */ /* |v| = v = (c - u a) / b = (c + |u| a) / b */ - mp_limb_t cy = mpn_add (tp, tp, 
size, cp, csize); + mp_limb_t cy = mpn_add (tp, tp, size, gp, gn); if (cy) - { - ASSERT (size < talloc); - tp[size++] = cy; - } + tp[size++] = cy; } /* Now divide t / b. There must be no remainder */ + bn = n; + MPN_NORMALIZE (bp, bn); + ASSERT (size >= bn); - ASSERT (size >= bsize); - ASSERT (size + bsize <= talloc); - rp = tp + size; - - vsize = size + 1 - bsize; - ASSERT (vsize <= valloc); + vn = size + 1 - bn; + ASSERT (vn <= n + 1); - mpn_tdiv_qr (vp, rp, 0, tp, size, bp, bsize); - MPN_NORMALIZE (vp, vsize); + /* FIXME: Use divexact. Or do the entire calculation mod 2^{n * + GMP_NUMB_BITS}. */ + mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn); + vn -= (vp[vn-1] == 0); /* Remainder must be zero */ #if WANT_ASSERT { mp_size_t i; - for (i = 0; i < bsize; i++) + for (i = 0; i < bn; i++) { - ASSERT (rp[i] == 0); + ASSERT (tp[i] == 0); } } #endif - return vsize; + return vn; } -static mp_size_t -gcdext_schoenhage_itch (mp_size_t asize, mp_size_t bsize) -{ - mp_size_t itch; - - mp_size_t ralloc = asize + 1; - mp_size_t ualloc = bsize + 1; - /* Input size for hgcd calls */ - mp_size_t halloc = (asize + 1) / 2; +/* Temporary storage: - /* Storage for the rows and quotient */ - mp_size_t rstorage = 4 * ralloc + 4 * ualloc + asize; + Initial division: Quotient of at most an - n + 1 <= an limbs. - /* Storage for hgcd calls */ - mp_size_t tstorage = mpn_hgcd_init_itch (halloc) - + qstack_itch (halloc) - + mpn_hgcd_itch (halloc); + Storage for u0 and u1: 2(n+1). - /* Storage needed for final gcdext_lehmer */ - mp_size_t lstorage - = gcdext_lehmer_itch (GCDEXT_SCHOENHAGE_THRESHOLD, - GCDEXT_SCHOENHAGE_THRESHOLD); + Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4) - /* Storage needed after final nhgcd_gcdext_lehmer */ - mp_size_t fstorage - = COMPUTE_V_ITCH (GCDEXT_SCHOENHAGE_THRESHOLD, - GCDEXT_SCHOENHAGE_THRESHOLD, - ualloc); + Storage for hgcd, input (n + 1)/2: 9 n/4 plus some. 
+ + When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 3(n+1) for the cofactors. + + When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less. + + For the lehmer call after the loop, Let T denote + GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for + u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T + + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient. + +*/ - /* We need rstorage + MAX (tstorage, lstorage, fstorage) */ +/* Optimal choice of p seems difficult. In each iteration the division + * of work beteen hgcd and the updates of u0 and u1 depends on the + * current size of the u. It may be desirable to use a different + * choice of p in each iteration. Also the input size seems to matter; + * choosing p = n / 3 in the first iteration seems to improve + * performance slightly for input size just above the theshold, but + * degrade performance for larger inputs. */ +#define CHOOSE_P_1(n) ((n) / 2) +#define CHOOSE_P_2(n) ((n) / 3) - itch = tstorage; - if (lstorage > tstorage) - itch = lstorage; - if (fstorage > itch) - itch = fstorage; +mp_size_t +mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, + mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n) +{ + mp_size_t talloc; + mp_size_t scratch; + mp_size_t matrix_scratch; + mp_size_t ualloc = n + 1; - return rstorage + itch; -} + mp_size_t un; + mp_ptr u0; + mp_ptr u1; -#if WANT_ASSERT -static void -sanity_check_row (mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - int sign, mp_size_t usize, - const struct hgcd_row *r) -{ - /* Check that x = u * a + v * b, for some v, i.e. that - x - u*a is divisible by b. 
*/ - mp_srcptr up = r->uvp[0]; - mp_srcptr xp = r->rp; - mp_size_t xsize = r->rsize; mp_ptr tp; - mp_size_t tsize; - mp_ptr qp; - mp_size_t qsize; - mp_ptr rp; - mp_size_t i; + TMP_DECL; - TMP_MARK; - ASSERT (asize > 0 && ap[asize - 1] != 0); - ASSERT (bsize > 0 && bp[bsize - 1] != 0); - ASSERT (xsize == 0 || xp[xsize - 1] != 0); - ASSERT (MPN_LEQ_P (xp, xsize, ap, asize)); - ASSERT (MPN_LEQ_P (up, usize, bp, bsize)); + ASSERT (an >= n); + ASSERT (n > 0); - MPN_NORMALIZE (up, usize); - if (usize == 0) - { - ASSERT (MPN_EQUAL_P (xp, xsize, bp, bsize)); - return; - } - - tp = TMP_ALLOC_LIMBS (usize + asize + 1); - qp = TMP_ALLOC_LIMBS (usize + asize + 2 - bsize); - rp = TMP_ALLOC_LIMBS (bsize); - - mpn_mul (tp, ap, asize, up, usize); - tsize = asize + usize; - tsize -= (tp[tsize - 1] == 0); + TMP_MARK; - if (sign >= 0) - { - ASSERT_NOCARRY (mpn_sub (tp, tp, tsize, xp, xsize)); - MPN_NORMALIZE (tp, tsize); - } - else - { - mp_limb_t cy = mpn_add (tp, tp, tsize, xp, xsize); - tp[tsize] = cy; - tsize += (cy != 0); - } + /* FIXME: Check for small sizes first, before setting up temporary + storage etc. */ + talloc = MPN_GCDEXT_LEHMER_N_ITCH(n); + + /* For initial division */ + scratch = an - n + 1; + if (scratch > talloc) + talloc = scratch; - if (tsize > 0) + if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) { - mpn_tdiv_qr (qp, rp, 0, tp, tsize, bp, bsize); - for (i = 0; i < bsize; i++) - ASSERT (rp[i] == 0); - qsize = tsize - bsize; - qsize += (qp[qsize] != 0); - ASSERT (MPN_LEQ_P (qp, qsize, ap, asize)); - } - TMP_FREE; -} -# define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r) \ -sanity_check_row (ap, asize, bp, bsize, sign, usize, r) - -#else /* !WANT_ASSERT */ -# define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r) -#endif /* !WANT_ASSERT */ + /* For hgcd loop. 
*/ + mp_size_t hgcd_scratch; + mp_size_t update_scratch; + mp_size_t p1 = CHOOSE_P_1 (n); + mp_size_t p2 = CHOOSE_P_2 (n); + mp_size_t min_p = MIN(p1, p2); + mp_size_t max_p = MAX(p1, p2); + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p); + hgcd_scratch = mpn_hgcd_itch (n - min_p); + update_scratch = max_p + n - 1; + + scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); + if (scratch > talloc) + talloc = scratch; -static mp_size_t -gcdext_schoenhage (mp_ptr gp, mp_ptr up, mp_size_t *usizep, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - mp_ptr tp, mp_size_t talloc) -{ - mp_size_t scratch; - struct hgcd hgcd; - struct qstack quotients; - struct hgcd_row r[4]; + /* Final mpn_gcdext_lehmer_n call. Need space for u and for + copies of a and b. */ + scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD) + + 3*GCDEXT_DC_THRESHOLD; - /* Size and sign of u fields. The largest u should be normalized to - this size, and except for the case u1 = 0, that is the latest - u. 
*/ - int rsize; - int rsign; + if (scratch > talloc) + talloc = scratch; - mp_ptr qp; - mp_size_t qsize; - mp_size_t ralloc = asize + 1; - mp_size_t ualloc = bsize + 1; - - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); - - ASSERT (4 * ralloc + 4*ualloc + asize <= talloc); - - r[0].rp = tp; tp += ralloc; talloc -= ralloc; - r[1].rp = tp; tp += ralloc; talloc -= ralloc; - r[2].rp = tp; tp += ralloc; talloc -= ralloc; - r[3].rp = tp; tp += ralloc; talloc -= ralloc; - - /* Must zero out the u fields */ - MPN_ZERO (tp, 4 * ualloc); - - r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc; - r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc; + /* Cofactors u0 and u1 */ + talloc += 2*(n+1); + } - qp = tp; tp += asize; talloc -= asize; + tp = TMP_ALLOC_LIMBS(talloc); - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; - MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; + if (an > n) + { + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n); - r[0].uvp[0][0] = 1; - r[1].uvp[0][0] = 0; + if (mpn_zero_p (ap, n)) + { + MPN_COPY (gp, bp, n); + *usizep = 0; + TMP_FREE; + return n; + } + } - /* We don't use the v fields. */ - rsize = 1; - rsign = 0; + if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) + { + mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp); - scratch = mpn_hgcd_init_itch ((asize + 1) / 2); - ASSERT (scratch <= talloc); - mpn_hgcd_init (&hgcd, (asize + 1) / 2, tp); - tp += scratch; talloc -= scratch; + TMP_FREE; + return gn; + } + + MPN_ZERO (tp, 2*ualloc); + u0 = tp; tp += ualloc; + u1 = tp; tp += ualloc; { - mp_size_t nlimbs = qstack_itch ((asize + 1) / 2); + /* For the first hgcd call, there are no u updates, and it makes + some sense to use a different choice for p. */ + + /* FIXME: We could trim use of temporary storage, since u0 and u1 + are not used yet. 
For the hgcd call, we could swap in the u0 + and u1 pointers for the relevant matrix elements. We could also + use a specialized hgcd function which computes only the last + two elements of the matrix. */ + + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P_1 (n); /* Same as for gcd. */ + mp_size_t nn; + + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) + { + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); - ASSERT (nlimbs <= talloc); - qstack_init ("ients, (asize + 1) / 2, tp, nlimbs); + /* Temporary storage 2 (p + M->n) <= p + n - 1 */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch); - tp += nlimbs; - talloc -= nlimbs; - scratch += nlimbs; + MPN_COPY (u0, M.p[1][0], M.n); + MPN_COPY (u1, M.p[1][1], M.n); + un = M.n; + while ( (u0[un-1] | u1[un-1] ) == 0) + un--; + } + else + { + /* mpn_hgcd has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. 
*/ + mp_size_t gn; + mp_size_t updated_un = 1; + + u1[0] = 1; + + /* Temporary storage n + 1 */ + n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n, + u0, u1, &updated_un, tp); + if (n == 0) + { + TMP_FREE; + return gn; + } + + un = updated_un; + ASSERT (un < ualloc); + } } - - while (ABOVE_THRESHOLD (r[0].rsize, GCDEXT_SCHOENHAGE_THRESHOLD) - && r[1].rsize > 0) + + while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) { - mp_size_t k = r[0].rsize / 2; - int res; - - ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r); - ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 1); - - if (r[1].rsize <= k) - goto euclid; + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P_2 (n); + mp_size_t nn; - qstack_reset ("ients, r[0].rsize - k); - - res = mpn_hgcd (&hgcd, - r[0].rp + k, r[0].rsize - k, - r[1].rp + k, r[1].rsize - k, - "ients, - tp, talloc); - - if (res == 0 || res == 1) + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) { - euclid: - qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize); - rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc); - ASSERT (rsize < ualloc); - - ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2); - - HGCD_SWAP3_LEFT (r); - rsign = ~rsign; + mp_size_t n0, n1; + mp_ptr t0; + mp_ptr t1; + + t0 = tp + matrix_scratch; + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + + /* Temporary storage 2 (p + M->n) <= p + n - 1 */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0); + + t1 = t0 + un; + + /* FIXME: This copying could be avoided by some swapping of + * pointers. May need more temporary storage, though. 
*/ + MPN_COPY (t0, u0, un); + MPN_COPY (t1, u1, un); + + /* By the same analysis as for mpn_hgcd_matrix_mul */ + ASSERT (M.n + un <= ualloc); + + /* Temporary storage un */ + n0 = addmul2_n (u0, t0, t1, un, + M.p[0][0], M.p[1][0], M.n, t1 + un); + n1 = addmul2_n (u1, t0, t1, un, + M.p[0][1], M.p[1][1], M.n, t1 + un); + + if (n0 > un) + un = n0; + if (n1 > un) + un = n1; + + ASSERT (un < ualloc); + ASSERT ( (u0[un-1] | u1[un-1]) > 0); } else { - const struct hgcd_row *s = hgcd.row + (res - 2); - int sign = hgcd.sign; - if (res == 3) - sign = ~sign; - - /* s[0] and s[1] are correct */ - r[2].rsize - = mpn_hgcd_fix (k, r[2].rp, ralloc, - sign, hgcd.size, s, - r[0].rp, r[1].rp, - tp, talloc); - - r[3].rsize - = mpn_hgcd_fix (k, r[3].rp, ralloc, - ~sign, hgcd.size, s+1, - r[0].rp, r[1].rp, - tp, talloc); - - rsize = hgcd_mul_vector (r + 2, ualloc, s, hgcd.size, - r, rsize, tp, talloc); - ASSERT (rsize < ualloc); - - rsign ^= sign; - ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2); - ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 3); - - HGCD_SWAP4_2 (r); + /* mpn_hgcd has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + mp_size_t gn; + mp_size_t updated_un = un; + + /* Temporary storage n + 1 */ + n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n, + u0, u1, &updated_un, tp); + if (n == 0) + { + TMP_FREE; + return gn; + } + + un = updated_un; + ASSERT (un < ualloc); } } - if (r[1].rsize == 0) + + if (mpn_zero_p (ap, n)) { - MPN_COPY (gp, r[0].rp, r[0].rsize); - MPN_NORMALIZE (r[0].uvp[0], rsize); - MPN_COPY (up, r[0].uvp[0], rsize); + MPN_COPY (gp, bp, n); + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + *usizep = -un; - *usizep = (rsign >= 0) ? 
rsize : - rsize; - return r[0].rsize; + TMP_FREE; + return n; } - else if (r[0].rsize == 1) + else if (mpn_zero_p (bp, n)) { - mp_limb_t u; - mp_limb_t v; - mp_limb_t cy; - - gp[0] = gcdext_1 (&u, &v, r[0].rp[0], r[1].rp[0]); + MPN_COPY (gp, ap, n); + MPN_NORMALIZE (u1, un); + MPN_COPY (up, u1, un); + *usizep = un; - /* g = u r0 + v r1 = (u u0 + v u1) a + (...) b */ - cy = mpn_addmul2_n_1 (up, rsize, - r[0].uvp[0], u, - r[1].uvp[0], v); - - rsize++; - if (cy) - up[rsize++] = cy; - else - MPN_NORMALIZE (up, rsize); + TMP_FREE; + return n; + } + else if (mpn_zero_p (u0, un)) + { + mp_size_t gn; + ASSERT (un == 1); + ASSERT (u1[0] == 1); - *usizep = (rsign >= 0) ? rsize : -rsize; - return 1; + /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */ + gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp); + TMP_FREE; + return gn; } else { - /* We have r0 = u0 a + v0 b, - r1 = u1 a + v1 b + /* We have A = ... a + ... b + B = u0 a + u1 b + + a = u1 A + ... B + b = -u0 A + ... B - Compute g = u r0 + v r1 = (u u0 + v u1) a + (...) b - In the expression (u u0 + v u1), we have + with bounds - u <= r1, - u0 <= b/r0 (except if r0 = a, which should never be the case here) - v <= r0 - u1 <= b/r0 - */ + |u0|, |u1| <= B / min(a, b) + + Compute g = u a + v b = (u u1 - v u0) A + (...) 
B + Here, u, v are bounded by - mp_size_t gsize; - mp_size_t usize; - mp_size_t vsize; - - /* u1 should be non-zero, and normalized */ - ASSERT (rsize); - ASSERT (r[1].uvp[0][rsize - 1] != 0); -#if WANT_TRACE - trace ("gcdext: \n" - "r0 = %Nd\n" - "r1 = %Nd\n" - "u0 = %Nd\n" - "u1 = %Nd\n", - r[0].rp, r[0].rsize, r[1].rp, r[1].rsize, - r[0].uvp[0], rsize, r[1].uvp[0], rsize); -#endif - /* We don't need the space for hgcd and the quotient stack any more */ - tp -= scratch; talloc += scratch; - - /* Stores u in r[2] and v in r[3] */ - gsize = gcdext_lehmer (gp, r[2].uvp[0], &usize, - r[0].rp, r[0].rsize, - r[1].rp, r[1].rsize, - tp, talloc); + |u| <= b, + |v| <= a + */ - if (usize == 0) + mp_size_t u0n; + mp_size_t u1n; + mp_size_t lehmer_un; + mp_size_t lehmer_vn; + mp_size_t gn; + + mp_ptr lehmer_up; + mp_ptr lehmer_vp; + int negate; + + lehmer_up = tp; tp += n; + + /* Call mpn_gcdext_lehmer_n with copies of a and b. */ + MPN_COPY (tp, ap, n); + MPN_COPY (tp + n, bp, n); + gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n); + + u0n = un; + MPN_NORMALIZE (u0, u0n); + if (lehmer_un == 0) { - /* u == 0 ==> v = g / b == 1 ==> g = u1 a + (...) b */ + /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */ + MPN_COPY (up, u0, u0n); + *usizep = -u0n; - MPN_NORMALIZE (r[1].uvp[0], rsize); - MPN_COPY (up, r[1].uvp[0], rsize); - *usizep = (rsign >= 0) ? 
- rsize : rsize; - - return gsize; + TMP_FREE; + return gn; } - /* Compute v = (g - s r0) / r1, storing it in r[3] */ - vsize = compute_v (r[3].uvp[0], ualloc, - r[0].rp, r[0].rsize, r[1].rp, r[1].rsize, - gp, gsize, - r[2].uvp[0], usize, - tp, talloc); + lehmer_vp = tp; + /* Compute v = (g - u a) / b */ + lehmer_vn = compute_v (lehmer_vp, + ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1); - if (usize < 0) + if (lehmer_un > 0) + negate = 0; + else { - usize = - usize; - rsign = ~rsign; + lehmer_un = -lehmer_un; + negate = 1; } - /* It's possible that u0 = 0, u1 = 1 */ - if (rsize == 1 && r[0].uvp[0][0] == 0) - { - /* u0 == 0 ==> u u0 + v u1 = v */ - MPN_COPY (up, r[3].uvp[0], vsize); - *usizep = (rsign >= 0) ? vsize : - vsize; + u1n = un; + MPN_NORMALIZE (u1, u1n); - return gsize; + /* It's possible that u0 = 1, u1 = 0 */ + if (u1n == 0) + { + ASSERT (un == 1); + ASSERT (u0[0] == 1); + + /* u1 == 0 ==> u u1 + v u0 = v */ + MPN_COPY (up, lehmer_vp, lehmer_vn); + *usizep = negate ? lehmer_vn : - lehmer_vn; + + TMP_FREE; + return gn; } - /* Ok, now u0, u1, u are non-zero. We may still have v == 0 */ - ASSERT (usize + rsize <= ualloc); - ASSERT (vsize + rsize <= ualloc); + ASSERT (lehmer_un + u1n <= ualloc); + ASSERT (lehmer_vn + u0n <= ualloc); + + /* Now u0, u1, u are non-zero. 
We may still have v == 0 */ /* Compute u u0 */ - if (usize <= rsize) + if (lehmer_un <= u1n) /* Should be the common case */ - mpn_mul (up, - r[0].uvp[0], rsize, - r[2].uvp[0], usize); + mpn_mul (up, u1, u1n, lehmer_up, lehmer_un); else - mpn_mul (up, - r[2].uvp[0], usize, - r[0].uvp[0], rsize); + mpn_mul (up, lehmer_up, lehmer_un, u1, u1n); - usize += rsize; + un = u1n + lehmer_un; + un -= (up[un - 1] == 0); - /* There may be more than one zero limb, if #u0 < #u1 */ - MPN_NORMALIZE (up, usize); - ASSERT (usize < ualloc); - - if (vsize) + if (lehmer_vn > 0) { mp_limb_t cy; - /* Overwrites old r[2].uvp[0] value */ - if (vsize <= rsize) + /* Overwrites old u1 value */ + if (lehmer_vn <= u0n) /* Should be the common case */ - cy = mpn_mul (r[2].uvp[0], - r[1].uvp[0], rsize, - r[3].uvp[0], vsize); + mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn); else - cy = mpn_mul (r[2].uvp[0], - r[3].uvp[0], vsize, - r[1].uvp[0], rsize); + mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n); - vsize += rsize - (cy == 0); - ASSERT (vsize < ualloc); + u1n = u0n + lehmer_vn; + u1n -= (u1[u1n - 1] == 0); - if (vsize <= usize) - cy = mpn_add (up, up, usize, r[2].uvp[0], vsize); + if (u1n <= un) + { + cy = mpn_add (up, up, un, u1, u1n); + } else { - cy = mpn_add (up, r[2].uvp[0], vsize, up, usize); - usize = vsize; + cy = mpn_add (up, u1, u1n, up, un); + un = u1n; } - up[usize] = cy; - usize += (cy != 0); + up[un] = cy; + un += (cy != 0); - ASSERT (usize < ualloc); + ASSERT (un < ualloc); } - *usizep = (rsign >= 0) ? usize : -usize; + *usizep = negate ? 
-un : un; - return gsize; - } -} - -mp_size_t -mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, - mp_ptr ap, mp_size_t asize, mp_ptr bp, mp_size_t bsize) -{ - ASSERT (asize >= bsize); - ASSERT (bsize > 0); - - if (asize == 1) - { -#if GCDEXT_1_USE_BINARY - mp_limb_t v; - *gp = gcdext_1 (up, &v, ap[0], bp[0]); -#else - *gp = gcdext_1_u (up, ap[0], bp[0]); -#endif - *usizep = (up[0] != 0); - ASSERT(gp[0] != 0); - return 1; - } - else if (BELOW_THRESHOLD (asize, GCDEXT_SCHOENHAGE_THRESHOLD)) - { - mp_size_t gsize; - mp_ptr tp; - mp_size_t talloc = gcdext_lehmer_itch (asize, bsize); - TMP_DECL; - TMP_MARK; - - tp = TMP_ALLOC_LIMBS (talloc); - gsize = gcdext_lehmer (gp, up, usizep, ap, asize, bp, bsize, - tp, talloc); - TMP_FREE; - return gsize; - } - else - { - mp_size_t gsize; - mp_ptr tp; - mp_size_t talloc = gcdext_schoenhage_itch (asize, bsize); - TMP_DECL; - TMP_MARK; - - tp = TMP_ALLOC_LIMBS (talloc); - gsize = gcdext_schoenhage (gp, up, usizep, ap, asize, bp, bsize, - tp, talloc); TMP_FREE; - return gsize; + return gn; } } diff --git a/mpn/generic/gcdext_1.c b/mpn/generic/gcdext_1.c new file mode 100644 index 000000000..efade2b4c --- /dev/null +++ b/mpn/generic/gcdext_1.c @@ -0,0 +1,319 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +/* Default to binary gcdext_1, since it is best on most current machines. + We should teach tuneup to choose the right gcdext_1. */ +#define GCDEXT_1_USE_BINARY 1 + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef NULL +# define NULL ((void *) 0) +#endif + +/* FIXME: Takes two single-word limbs. It could be extended to a + * function that accepts a bignum for the first input, and only + * returns the first co-factor. */ + +/* Returns g, u and v such that g = u A - v B. There are three + different cases for the result: + + g = u A - v B, 0 < u < b, 0 < v < a + g = A u = 1, v = 0 + g = B u = B, v = A - 1 + + We always return with 0 < u <= b, 0 <= v < a. +*/ +#if GCDEXT_1_USE_BINARY + +static mp_limb_t +gcdext_1_odd (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) +{ + mp_limb_t u0; + mp_limb_t v0; + mp_limb_t v1; + mp_limb_t u1; + + mp_limb_t B = b; + mp_limb_t A = a; + + /* Through out this function maintain + + a = u0 A - v0 B + b = u1 A - v1 B + + where A and B are odd. 
*/ + + u0 = 1; v0 = 0; + u1 = b; v1 = a-1; + + if (A == 1) + { + *up = u0; *vp = v0; + return 1; + } + else if (B == 1) + { + *up = u1; *vp = v1; + return 1; + } + + while (a != b) + { + mp_limb_t mask; + + ASSERT (a % 2 == 1); + ASSERT (b % 2 == 1); + + ASSERT (0 < u0); ASSERT (u0 <= B); + ASSERT (0 < u1); ASSERT (u1 <= B); + + ASSERT (0 <= v0); ASSERT (v0 < A); + ASSERT (0 <= v1); ASSERT (v1 < A); + + if (a > b) + { + MP_LIMB_T_SWAP (a, b); + MP_LIMB_T_SWAP (u0, u1); + MP_LIMB_T_SWAP (v0, v1); + } + + ASSERT (a < b); + + /* Makes b even */ + b -= a; + + mask = - (mp_limb_t) (u1 < u0); + u1 += B & mask; + v1 += A & mask; + u1 -= u0; + v1 -= v0; + + ASSERT (b % 2 == 0); + + do + { + /* As b = u1 A + v1 B is even, while A and B are odd, + either both or none of u1, v1 is even */ + + ASSERT (u1 % 2 == v1 % 2); + + mask = -(u1 & 1); + u1 = u1 / 2 + ((B / 2) & mask) - mask; + v1 = v1 / 2 + ((A / 2) & mask) - mask; + + b /= 2; + } + while (b % 2 == 0); + } + + /* Now g = a = b */ + ASSERT (a == b); + ASSERT (u1 <= B); + ASSERT (v1 < A); + + ASSERT (A % a == 0); + ASSERT (B % a == 0); + ASSERT (u0 % (B/a) == u1 % (B/a)); + ASSERT (v0 % (A/a) == v1 % (A/a)); + + *up = u0; *vp = v0; + + return a; +} + +mp_limb_t +mpn_gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) +{ + unsigned shift = 0; + mp_limb_t g; + mp_limb_t u; + mp_limb_t v; + + /* We use unsigned values in the range 0, ... B - 1. As the values + are uniquely determined only modulo B, we can add B at will, to + get numbers in range or flip the least significant bit. */ + /* Deal with powers of two */ + while ((a | b) % 2 == 0) + { + a /= 2; b /= 2; shift++; + } + + if (b % 2 == 0) + { + unsigned k = 0; + + do { + b /= 2; k++; + } while (b % 2 == 0); + + g = gcdext_1_odd (&u, &v, a, b); + + while (k--) + { + /* We have g = u a + v b, and need to construct + g = u'a + v'(2b). 
+ + If v is even, we can just set u' = u, v' = v/2 + If v is odd, we can set v' = (v + a)/2, u' = u + b + */ + + if (v % 2 == 0) + v /= 2; + else + { + u = u + b; + v = v/2 + a/2 + 1; + } + b *= 2; + } + } + else if (a % 2 == 0) + { + unsigned k = 0; + + do { + a /= 2; k++; + } while (a % 2 == 0); + + g = gcdext_1_odd (&u, &v, a, b); + + while (k--) + { + /* We have g = u a + v b, and need to construct + g = u'(2a) + v'b. + + If u is even, we can just set u' = u/2, v' = v. + If u is odd, we can set u' = (u + b)/2 + */ + + if (u % 2 == 0) + u /= 2; + else + { + u = u/2 + b/2 + 1; + v = v + a; + } + a *= 2; + } + } + else + /* Ok, both are odd */ + g = gcdext_1_odd (&u, &v, a, b); + + *up = u; + *vp = v; + + return g << shift; +} + +#else /* ! GCDEXT_1_USE_BINARY */ +static mp_limb_t +gcdext_1_u (mp_limb_t *up, mp_limb_t a, mp_limb_t b) +{ + /* Maintain + + a = u0 A mod B + b = - u1 A mod B + */ + mp_limb_t u0 = 1; + mp_limb_t u1 = 0; + mp_limb_t B = b; + + ASSERT (a >= b); + ASSERT (b > 0); + + for (;;) + { + mp_limb_t q; + + q = a / b; + a -= q * b; + + if (a == 0) + { + *up = B - u1; + return b; + } + u0 += q * u1; + + q = b / a; + b -= q * a; + + if (b == 0) + { + *up = u0; + return a; + } + u1 += q * u0; + } +} + +mp_limb_t +mpn_gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) +{ + /* Maintain + + a = u0 A - v0 B + b = - u1 A + v1 B = (B - u1) A - (A - v1) B + */ + mp_limb_t u0 = 1; + mp_limb_t v0 = 0; + mp_limb_t u1 = 0; + mp_limb_t v1 = 1; + + mp_limb_t A = a; + mp_limb_t B = b; + + ASSERT (a >= b); + ASSERT (b > 0); + + for (;;) + { + mp_limb_t q; + + q = a / b; + a -= q * b; + + if (a == 0) + { + *up = B - u1; + *vp = A - v1; + return b; + } + u0 += q * u1; + v0 += q * v1; + + q = b / a; + b -= q * a; + + if (b == 0) + { + *up = u0; + *vp = v0; + return a; + } + u1 += q * u0; + v1 += q * v0; + } +} +#endif /* ! 
GCDEXT_1_USE_BINARY */ diff --git a/mpn/generic/gcdext_lehmer.c b/mpn/generic/gcdext_lehmer.c new file mode 100644 index 000000000..34a503d19 --- /dev/null +++ b/mpn/generic/gcdext_lehmer.c @@ -0,0 +1,162 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Temporary storage: 2*(n+1) for u. n+1 for the matrix-vector + multiplications (if hgcd2 succeeds). If hgcd fails, n+1 limbs are + needed for the division, with most n for the quotient, and n+1 for + the product q u0. In all, 4n + 3. */ + +mp_size_t +mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize, + mp_ptr ap, mp_ptr bp, mp_size_t n, + mp_ptr tp) +{ + mp_size_t ualloc = n + 1; + + /* Keeps track of the second row of the reduction matrix + * + * M = (v0, v1 ; u0, u1) + * + * which correspond to the first column of the inverse + * + * M^{-1} = (u1, -v1; -u0, v0) + */ + + mp_size_t un; + mp_ptr u0; + mp_ptr u1; + + MPN_ZERO (tp, 2*ualloc); + u0 = tp; tp += ualloc; + u1 = tp; tp += ualloc; + + u1[0] = 1; un = 1; + + /* FIXME: Handle n == 2 differently, after the loop? 
*/ + while (n >= 2) + { + struct hgcd_matrix1 M; + mp_limb_t ah, al, bh, bl; + mp_limb_t mask; + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else if (n == 2) + { + /* We use the full inputs without truncation, so we can + safely shift left. */ + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[1], ap[0]); + al = ap[0] << shift; + bh = MPN_EXTRACT_NUMB (shift, bp[1], bp[0]); + bl = bp[0] << shift; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_nhgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M)) + { + n = mpn_hgcd_mul_matrix1_inverse_vector (&M, n, ap, bp, tp); + un = mpn_hgcd_mul_matrix1_vector(&M, un, u0, u1, tp); + } + else + { + /* mpn_hgcd2 has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + mp_size_t gn; + mp_size_t updated_un = un; + + /* Temporary storage n + 1 */ + n = mpn_gcdext_subdiv_step (gp, &gn, up, usize, ap, bp, n, + u0, u1, &updated_un, tp); + if (n == 0) + return gn; + + un = updated_un; + } + } + if (ap[0] == 0) + { + gp[0] = bp[0]; + + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + + *usize = -un; + return 1; + } + else if (bp[0] == 0) + { + gp[0] = ap[0]; + + MPN_NORMALIZE (u1, un); + MPN_COPY (up, u1, un); + + *usize = un; + return 1; + } + else + { + mp_limb_t uh, vh; + mp_limb_t u; + mp_limb_t v; + + gp[0] = mpn_gcdext_1 (&u, &v, ap[0], bp[0]); + + /* Set up = u u1 + v u0. Keep track of size, un grows by one or + two limbs. 
*/ + uh = mpn_mul_1 (up, u1, un, u); + vh = mpn_addmul_1 (up, u0, un, v); + + if ( (uh | vh) > 0) + { + mp_limb_t cy; + uh += vh; + up[un++] = uh; + if (uh < vh) + up[un++] = 1; + } + + *usize = un; + return 1; + } +} diff --git a/mpn/generic/gcdext_subdiv_step.c b/mpn/generic/gcdext_subdiv_step.c new file mode 100644 index 000000000..8a4ba1f42 --- /dev/null +++ b/mpn/generic/gcdext_subdiv_step.c @@ -0,0 +1,188 @@ +/* gcdext_subdiv_step.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +static inline int +mpn_zero_p (mp_srcptr ap, mp_size_t n) +{ + mp_size_t i; + for (i = n - 1; i >= 0; i--) + { + if (ap[i] != 0) + return 0; + } + return 1; +} + +/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or + b is small, or the difference is small. Perform one subtraction + followed by one division. If the gcd is found, stores it in gp and + *gn, and returns zero. Otherwise, compute the reduced a and b, + return the new size, and cofactors. 
*/ + +/* Temporary storage: Let N be a bound both for the inputs a, b, and + the cofactors u0, u1 after the division step. Then up to N is + needed for the quotient, and N+1 for the product q u0. All in all, + 2N + 1. */ +mp_size_t +mpn_gcdext_subdiv_step (mp_ptr gp, mp_size_t *gn, mp_ptr up, mp_size_t *usizep, + mp_ptr ap, mp_ptr bp, mp_size_t n, + mp_ptr u0, mp_ptr u1, mp_size_t *unp, mp_ptr tp) + +{ + mp_size_t an, bn, un; + mp_size_t qn; + mp_size_t u0n; + + int swapped; + + an = bn = n; + + ASSERT (an > 0); + ASSERT (ap[an-1] > 0 || bp[an-1] > 0); + + MPN_NORMALIZE (ap, an); + MPN_NORMALIZE (bp, bn); + + un = *unp; + + swapped = 0; + + if (UNLIKELY (an == 0)) + { + return_b: + MPN_COPY (gp, bp, bn); + *gn = bn; + + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + + *usizep = swapped ? un : -un; + + return 0; + } + else if (UNLIKELY (bn == 0)) + { + return_a: + MPN_COPY (gp, ap, an); + *gn = an; + + MPN_NORMALIZE (u1, un); + MPN_COPY (up, u1, un); + + *usizep = swapped ? -un : un; + + return 0; + } + + /* Arrange so that a > b, subtract an -= bn, and maintain + normalization. 
*/ + if (an < bn) + { + MPN_PTR_SWAP (ap, an, bp, bn); + MP_PTR_SWAP (u0, u1); + swapped ^= 1; + } + else if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + goto return_a; + else if (c < 0) + { + MP_PTR_SWAP (ap, bp); + MP_PTR_SWAP (u0, u1); + swapped ^= 1; + } + } + /* Reduce a -= b, u1 += u0 */ + ASSERT_NOCARRY (mpn_sub (ap, ap, an, bp, bn)); + MPN_NORMALIZE (ap, an); + ASSERT (an > 0); + + u1[un] = mpn_add_n (u1, u1, u0, un); + un += (u1[un] > 0); + + /* Arrange so that a > b, and divide a = q b + r */ + if (an < bn) + { + MPN_PTR_SWAP (ap, an, bp, bn); + MP_PTR_SWAP (u0, u1); + swapped ^= 1; + } + else if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + goto return_a; + else if (c < 0) + { + MP_PTR_SWAP (ap, bp); + MP_PTR_SWAP (u0, u1); + swapped ^= 1; + } + } + + /* Reduce a -= q b, u1 += q u0 */ + qn = an - bn + 1; + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn); + + if (mpn_zero_p (ap, bn)) + goto return_b; + + n = bn; + + /* Update u1 += q u0 */ + u0n = un; + MPN_NORMALIZE (u0, u0n); + + if (u0n > 0) + { + qn -= (tp[qn - 1] == 0); + + if (qn > u0n) + mpn_mul (tp + qn, tp, qn, u0, u0n); + else + mpn_mul (tp + qn, u0, u0n, tp, qn); + + if (qn + u0n > un) + { + ASSERT_NOCARRY (mpn_add (u1, tp + qn, qn + u0n, u1, un)); + un = qn + u0n; + un -= (u1[un-1] == 0); + } + else + { + u1[un] = mpn_add (u1, u1, un, tp + qn, qn + u0n); + un += (u1[un] > 0); + } + } + + *unp = un; + return n; +} diff --git a/mpn/generic/hgcd.c b/mpn/generic/hgcd.c index 8f1967b32..ae8053d77 100644 --- a/mpn/generic/hgcd.c +++ b/mpn/generic/hgcd.c @@ -4,7 +4,7 @@ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. -Copyright 2003, 2004, 2005 Free Software Foundation, Inc. +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -21,2125 +21,624 @@ License for more details. 
You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ -#define WANT_TRACE 0 - -#if WANT_TRACE -# include <stdio.h> -# include <stdarg.h> -#endif - #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" -#if WANT_TRACE -static void -trace (const char *format, ...) +/* For input of size n, matrix elements are of size at most ceil(n/2) + - 1, but we need two limbs extra. */ +void +mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p) { - va_list args; - va_start (args, format); - gmp_vfprintf (stderr, format, args); - va_end (args); + mp_size_t s = (n+1)/2 + 1; + M->alloc = s; + M->n = 1; + MPN_ZERO (p, 4 * s); + M->p[0][0] = p; + M->p[0][1] = p + s; + M->p[1][0] = p + 2 * s; + M->p[1][1] = p + 3 * s; + + M->p[0][0][0] = M->p[1][1][0] = 1; } -#endif - -/* Comparison of _normalized_ numbers. */ - -#define MPN_EQUAL_P(ap, asize, bp, bsize) \ -((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0) - -#define MPN_LEQ_P(ap, asize, bp, bsize) \ -((asize) < (bsize) || ((asize) == (bsize) \ - && mpn_cmp ((ap), (bp), (asize)) <= 0)) - -#define MPN_LESS_P(ap, asize, bp, bsize) \ -((asize) < (bsize) || ((asize) == (bsize) \ - && mpn_cmp ((ap), (bp), (asize)) < 0)) -/* Extract one limb, shifting count bits left - ________ ________ - |___xh___||___xl___| - |____r____| - >count < - - The count includes any nail bits, so it should work fine if - count is computed using count_leading_zeros. -*/ - -#define MPN_EXTRACT_LIMB(count, xh, xl) \ - ((((xh) << ((count) - GMP_NAIL_BITS)) & GMP_NUMB_MASK) | \ - ((xl) >> (GMP_LIMB_BITS - (count)))) - - -/* Return -1 if a < x + y + z, - 0 if a = x + y + z, - 1 if a > x + y + z. */ -static int -mpn_cmp_sum3 (mp_srcptr ap, mp_size_t an, - mp_srcptr xp, mp_size_t xn, - mp_srcptr yp, mp_size_t yn, - mp_srcptr zp, mp_size_t zn) +/* Updated column COL, adding in column (1-COL). 
*/ +static void +hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col) { - mp_limb_t cy; + mp_limb_t c0, c1; + ASSERT (col < 2); - /* Check that all limbs beyond an are zero. This should be slightly - cheaper than fully normalizing all the input numbers. */ + c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n); + c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n); - while (xn > an) - if (xp[--xn] > 0) return -1; - while (yn > an) - if (yp[--yn] > 0) return -1; - while (zn > an) - if (zp[--zn] > 0) return -1; + M->p[0][col][M->n] = c0; + M->p[1][col][M->n] = c1; - /* Start by sorting so that xn >= yn >= zn. Six permutations, so we - can't get away with less than three comparisons, at least not for - the worst case. */ - - if (xn < yn) - MPN_SRCPTR_SWAP (xp, xn, yp, yn); - if (yn < zn) - MPN_SRCPTR_SWAP (yp, yn, zp, zn); - if (xn < yn) - MPN_SRCPTR_SWAP (xp, xn, yp, yn); - - ASSERT (an >= xn && xn >= yn && yn >= zn); - - /* Assume that a = x + y + z, and write the addition limb by limb. - - (c[1], a[0]) = x[0] + y[0] + z[0] + c[0] - (c[2], a[1]) = x[1] + y[1] + z[1] + c[1] - (c[k+1], a[k]) = x[k] + y[k] + z[k] + c[2] - ... - (c[n], a[n-1]) = x[n-1] + y[n-1] + z[n-1] + c[n-1] - - where the start and stop conditions are that c[0] = c[n] = 0. - Then we can start at the high end, iterating - - c[k] = (c[k+1], a[k]) - x[k] - y[k] - z[k] - - If equality holds, then 0 <= c[k] <= 2 for all k (since for - example 0xf + 0xf + 0xf + 2 = 0x2f). If we find c[k] < 0, then we - know that a < x + y + z, and if we find c[k] > 2, then we know a - > x + y + z. */ + M->n += (c0 | c1) != 0; + ASSERT (M->n < M->alloc); +} - cy = 0; +/* Updated column COL, adding in column Q * (1-COL). Temporary + * storage: qn + n <= M->alloc, where n is the size of the largest + * element in column 1 - COL. 
*/ +static void +hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn, + unsigned col, mp_ptr tp) +{ + ASSERT (col < 2); - while (an > xn) + if (qn == 1) { - /* c[k] = (c[k+1], a[k]) */ - if (cy > 0) - return 1; + mp_limb_t q = qp[0]; + mp_limb_t c0, c1; - cy = ap[--an]; - } + c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q); + c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q); -#if GMP_NAIL_BITS >= 2 - while (an > yn) - { - if (cy > 1) - return 1; + M->p[0][col][M->n] = c0; + M->p[1][col][M->n] = c1; - cy = (cy << GMP_NUMB_BITS) + ap[--an]; - if (cy < xp[an]) - return -1; - cy -= xp[an]; + M->n += (c0 | c1) != 0; } - while (an > zn) + else { - mp_limb_t s; + unsigned row; - if (cy > 2) - return 1; + /* Carries for the unlikely case that we get both high words + from the multiplication and carries from the addition. */ + mp_limb_t c[2]; + mp_size_t n; - cy = (cy << GMP_NUMB_BITS ) + ap[--an]; - s = xp[an] + yp[an]; - if (cy < s) - return -1; - cy -= s; - } - while (an > 0) - { - mp_limb_t s; + /* The matrix will not necessarily grow in size by qn, so we + need normalization in order not to overflow M. 
*/ - if (cy > 2) - return 1; - - cy = (cy << GMP_NUMB_BITS ) + ap[--an]; - s = xp[an] + yp[an] + zp[an]; - if (cy < s) - return -1; - cy -= s; - } -#else /* GMP_NAIL_BITS < 2 */ -#if GMP_NAIL_BITS == 1 -loselose -#endif - while (an > yn) - { - /* c[k] = (c[k+1], a[k]) - x[k] */ - if (cy > 1) - return 1; + for (n = M->n; n + qn > M->n; n--) + { + ASSERT (n > 0); + if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0) + break; + } + + ASSERT (qn + n <= M->alloc); - --an; + for (row = 0; row < 2; row++) + { + if (qn <= n) + mpn_mul (tp, M->p[row][1-col], n, qp, qn); + else + mpn_mul (tp, qp, qn, M->p[row][1-col], n); - if (cy == 1) + ASSERT (n + qn >= M->n); + c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n); + } + if (c[0] | c[1]) { - if (ap[an] >= xp[an]) - return 1; - cy = (ap[an] - xp[an]) & GMP_NUMB_MASK; + M->n = n + qn + 1; + M->p[0][col][n-1] = c[0]; + M->p[1][col][n-1] = c[1]; } else { - /* cy == 0 */ - if (ap[an] < xp[an]) - return -1; - else - cy = ap[an] - xp[an]; + n += qn; + n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0; + if (n > M->n) + M->n = n; } } - while (an > zn) - { - mp_limb_t sh, sl; - - /* c[k] = (c[k+1], a[k]) - x[k] - y[k] */ - if (cy > 2) - return 1; - - --an; - - sl = xp[an] + yp[an]; - sh = (sl < xp[an]); - - if (cy < sh || (cy == sh && ap[an] < sl)) - return -1; - - sl = ap[an] - sl; /* Monkey business */ - sh = cy - sh - (sl > ap[an]); - if (sh > 0) - return 1; - cy = sl; - } - while (an > 0) - { - mp_limb_t sh, sl; - if (cy > 2) - return 1; - - --an; - - sl = xp[an] + yp[an]; - sh = (sl < xp[an]); - - sl += zp[an]; - sh += sl < zp[an]; - - if (cy < sh || (cy == sh && ap[an] < sl)) - return -1; - sl = ap[an] - sl; /* Monkey business */ - sh = cy - sh - (sl > ap[an]); - if (sh > 0) - return 1; - cy = sl; - } -#endif /* GMP_NAIL_BITS < 2 */ - return cy > 0; -} - -/* Only the first row has v = 0, a = 1 * a + 0 * b */ -static inline int -hgcd_start_row_p (const struct hgcd_row *r, mp_size_t n) -{ - mp_size_t i; - 
mp_srcptr vp = r->uvp[1]; - - for (i = 0; i < n; i++) - if (vp[i] != 0) - return 0; - - return 1; + ASSERT (M->n < M->alloc); } -/* Called when r[0, 1, 2] >= W^M, r[3] < W^M. Returns the number of - remainders that satisfy Jebelean's criterion, i.e. find the largest k - such that - - r[k+1] >= max (-u[k+1], - v[k+1]) - - r[k] - r[k-1] >= max (u[k+1] - u[k], v[k+1] - v[k]) - - Return 0 on failure, i.e. if B or A mod B < W^M. Return 1 in case - r0 and r1 are correct, but we still make no progress because r0 = - A, r1 = B. - - Otherwise return 2, 3 or 4, the number of r:s that are correct. - */ -static int -hgcd_jebelean (const struct hgcd *hgcd, mp_size_t M) +/* Multiply M by M1 from the right. Since the M1 elements fit in + GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs + temporary space M->n */ +static void +hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1, + mp_ptr tp) { - mp_size_t L; - unsigned bit; - - ASSERT (hgcd->row[0].rsize > M); - ASSERT (hgcd->row[1].rsize > M); - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize <= M); - - ASSERT (MPN_LESS_P (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[0].rp, hgcd->row[0].rsize)); - ASSERT (MPN_LESS_P (hgcd->row[2].rp, hgcd->row[2].rsize, - hgcd->row[1].rp, hgcd->row[1].rsize)); - ASSERT (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)); - - ASSERT (mpn_cmp (hgcd->row[0].uvp[1], hgcd->row[1].uvp[1], hgcd->size) <= 0); - ASSERT (mpn_cmp (hgcd->row[1].uvp[1], hgcd->row[2].uvp[1], hgcd->size) <= 0); - ASSERT (mpn_cmp (hgcd->row[2].uvp[1], hgcd->row[3].uvp[1], hgcd->size) <= 0); - - /* The bound is really floor (N/2), which is <= M = ceil (N/2) */ - L = hgcd->size; - ASSERT (L <= M); - - ASSERT (L > 0); - ASSERT (hgcd->row[3].uvp[1][L - 1] != 0); - - bit = hgcd->sign < 0; - - /* Check r1 - r2 >= max (u2 - u1, v2 - v1) = {|u1| + |u2|, |v1| + |v2|}[bit] */ - - if (mpn_cmp_sum3 (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[2].rp, 
hgcd->row[2].rsize, - hgcd->row[1].uvp[bit], L, - hgcd->row[2].uvp[bit], L) < 0) - return 2 - (hgcd_start_row_p (hgcd->row, hgcd->size)); - - /* Ok, r2 is correct */ - - /* Check r3 >= max (-u3, -v3) = (|u3|, |v3|)[bit] */ - if (hgcd->row[3].rsize > L) - /* Condition satisfied */ - ; - else + unsigned row; + mp_limb_t grow; + for (row = 0, grow = 0; row < 2; row++) { - mp_size_t size; - for (size = L; size > hgcd->row[3].rsize; size--) - { - if (hgcd->row[3].uvp[bit][size-1] != 0) - return 3; - } - if (mpn_cmp (hgcd->row[3].rp, hgcd->row[3].uvp[bit], size) < 0) - return 3; - } + mp_limb_t c0, c1; - /* Check r3 - r2 >= max(u3-u2, v3-v2) = {|u2| + |u3|, |v2| +|v3|}[1-bit] */ + /* Compute (u, u') <-- (r00 u + r10 u', r01 u + r11 u') as - if (mpn_cmp_sum3 (hgcd->row[2].rp, hgcd->row[2].rsize, - hgcd->row[3].rp, hgcd->row[3].rsize, - hgcd->row[2].uvp[bit ^ 1], L, - hgcd->row[3].uvp[bit ^ 1], L) < 0) - return 3; - - /* Ok, r3 is correct */ - return 4; -} + t = u + u *= r00 + u += r10 * u' + u' *= r11 + u' += r01 * t + */ + /* FIXME: Duplication with mpn_hgcd_mul_matrix1_vector. */ + MPN_COPY (tp, M->p[row][0], M->n); + c0 = mpn_mul_1 (M->p[row][0], M->p[row][0], M->n, M1->u[0][0]); + c0 += mpn_addmul_1 (M->p[row][0], M->p[row][1], M->n, M1->u[1][0]); + M->p[row][0][M->n] = c0; -/* Compute au + bv. u and v are single limbs, a and b are n limbs each. - Stores n+1 limbs in rp, and returns the (n+2)'nd limb. */ -/* FIXME: With nails, we can instead return limb n+1, possibly including - one non-zero nail bit. 
*/ -static mp_limb_t -mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n, - mp_srcptr ap, mp_limb_t u, - mp_srcptr bp, mp_limb_t v) -{ - mp_limb_t h; - mp_limb_t cy; + c1 = mpn_mul_1 (M->p[row][1], M->p[row][1], M->n, M1->u[1][1]); + c1 += mpn_addmul_1 (M->p[row][1], tp, M->n, M1->u[0][1]); + M->p[row][1][M->n] = c1; - h = mpn_mul_1 (rp, ap, n, u); - cy = mpn_addmul_1 (rp, bp, n, v); - h += cy; -#if GMP_NAIL_BITS == 0 - rp[n] = h; - return (h < cy); -#else /* GMP_NAIL_BITS > 0 */ - rp[n] = h & GMP_NUMB_MASK; - return h >> GMP_NUMB_BITS; -#endif /* GMP_NAIL_BITS > 0 */ -} - - -static inline void -qstack_drop (struct qstack *stack) -{ - ASSERT (stack->size_next); - stack->limb_next -= stack->size[--stack->size_next]; -} - -/* Get top element */ -static inline mp_size_t -qstack_get_0 (const struct qstack *stack, - mp_srcptr *qp) -{ - mp_size_t qsize; - ASSERT (stack->size_next); - - qsize = stack->size[stack->size_next - 1]; - *qp = stack->limb + stack->limb_next - qsize; - - return qsize; + grow |= (c0 | c1); + } + M->n += (grow != 0); + ASSERT (M->n < M->alloc); } -/* Get element just below the top */ -static inline mp_size_t -qstack_get_1 (const struct qstack *stack, - mp_srcptr *qp) -{ - mp_size_t qsize; - ASSERT (stack->size_next >= 2); +/* Perform a few steps, using some of mpn_hgcd2, subtraction and + division. Reduces the size by almost one limb or more, but never + below the given size s. Return new size for a and b, or 0 if no + more steps are possible. - qsize = stack->size[stack->size_next - 2]; - *qp = stack->limb + stack->limb_next - - stack->size[stack->size_next - 1] - - qsize; + If hgcd2 succeds, needs temporary space for hgcd_matrix_mul_1, M->n + limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2 + fails, needs space for the quotient, qn <= n - s + 1 limbs, for and + hgcd_matrix_update_q, qn + (size of the appropriate column of M) <= + resulting size of $. 
- return qsize; -} + If N is the input size to the calling hgcd, then s = floor(N/2) + + 1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1 + < N, so N is sufficient. +*/ -/* Adds d to the element on top of the stack */ -static void -qstack_adjust (struct qstack *stack, mp_limb_t d) +static mp_size_t +hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, + struct hgcd_matrix *M, mp_ptr tp) { - mp_size_t qsize; + struct hgcd_matrix1 M1; + mp_limb_t mask; + mp_limb_t ah, al, bh, bl; + mp_size_t an, bn, qn; mp_ptr qp; + int col; - ASSERT (stack->size_next); + ASSERT (n > s); - ASSERT_QSTACK (stack); + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); - if (stack->limb_next >= stack->limb_alloc) + if (n == s + 1) { - qstack_rotate (stack, 1); - } - - ASSERT (stack->limb_next < stack->limb_alloc); + if (mask < 4) + goto subtract; - qsize = stack->size[stack->size_next - 1]; - qp = stack->limb + stack->limb_next - qsize; - - if (qsize == 0) - { - qp[0] = 1 + d; - stack->size[stack->size_next - 1] = 1; - stack->limb_next++; + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; } - else + else if (mask & GMP_NUMB_HIGHBIT) { - mp_limb_t cy = mpn_add_1 (qp, qp, qsize, d); - if (cy) - { - qp[qsize] = cy; - stack->size[stack->size_next - 1]++; - stack->limb_next++; - } + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; } - - ASSERT_QSTACK (stack); -} - -/* hgcd2 operations */ - -/* Computes P = R * S. No overlap allowed. 
*/ -static mp_size_t -hgcd2_mul (struct hgcd_row *P, mp_size_t alloc, - const struct hgcd2_row *R, - const struct hgcd_row *S, mp_size_t n) -{ - int grow = 0; - mp_limb_t h = 0; - unsigned i; - unsigned j; - - ASSERT (n < alloc); - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - { - /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] - = u_i s0j + v_i s1j */ - mp_limb_t cy; - - cy = mpn_addmul2_n_1 (P[i].uvp[j], n, - S[0].uvp[j], R[i].u, - S[1].uvp[j], R[i].v); - if (cy) - { - ASSERT (n + 2 <= alloc); - P[i].uvp[j][n+1] = cy; - grow = 1; - } - else - h |= P[i].uvp[j][n]; - } - if (grow) - return n + 2; else - /* Don't add redundant zeroes */ - return n + (h != 0); -} - -unsigned -mpn_hgcd_max_recursion (mp_size_t n) -{ - int count; - - count_leading_zeros (count, (mp_limb_t) - (1 + n / (HGCD_SCHOENHAGE_THRESHOLD - 5))); - - return GMP_LIMB_BITS - count; -} - -mp_size_t -mpn_hgcd_init_itch (mp_size_t size) -{ - /* r0 <= a, r1, r2, r3 <= b, but for simplicity, we allocate asize + - 1 for all of them. The size of the uv:s are limited to asize / 2, - but we allocate one extra limb. 
*/ - - return 4 * (size + 1) + 8 * ((size / 2) + 1); -} - -void -mpn_hgcd_init (struct hgcd *hgcd, - mp_size_t asize, - mp_limb_t *limbs) -{ - unsigned i; - unsigned j; - mp_size_t alloc = (asize / 2) + 1; - - hgcd->sign = 0; - - for (i = 0; i < 4; i++) - { - hgcd->row[i].rp = limbs; - hgcd->row[i].rsize = asize + 1; limbs += asize + 1; - } - - hgcd->alloc = alloc; - hgcd->size = alloc; - - for (i = 0; i < 4; i++) - for (j = 0; j < 2; j++) - { - hgcd->row[i].uvp[j] = limbs; - limbs += alloc; - } -} - -#if WANT_ASSERT -void -__gmpn_hgcd_sanity (const struct hgcd *hgcd, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - unsigned start, unsigned end) -{ - int sign; - unsigned i; - mp_size_t L = hgcd->size; - mp_ptr tp; - mp_size_t talloc; - mp_ptr t1p; - mp_ptr t2p; - const struct hgcd_row *r; - - ASSERT (asize >= bsize); - - ASSERT (L <= asize / 2); - ASSERT (L); - - ASSERT (L <= asize); - ASSERT (L <= bsize); - - /* NOTE: We really need only asize + bsize + 2*L, but since we're - * swapping the pointers around, we allocate 2*(asize + L). 
*/ - talloc = 2*(asize + L); - tp = __GMP_ALLOCATE_FUNC_LIMBS (talloc); - t1p = tp; - t2p = t1p + (asize + L); - - sign = hgcd->sign; - if (start % 2) - sign = ~sign; - for (i = start, r = &hgcd->row[start]; i < end; i++, sign = ~sign, r++) { - mp_size_t t1size = asize + L; - mp_size_t t2size = bsize + L; - - mp_size_t k; - for (k = hgcd->size; k < hgcd->alloc; k++) - { - ASSERT (r->uvp[0][k] == 0); - ASSERT (r->uvp[1][k] == 0); - } - - mpn_mul (t1p, ap, asize, r->uvp[0], L); - mpn_mul (t2p, bp, bsize, r->uvp[1], L); - - if (sign < 0) - MPN_PTR_SWAP (t1p, t1size, t2p, t2size); + int shift; - MPN_NORMALIZE (t2p, t2size); - ASSERT (t2size <= t1size); - ASSERT_NOCARRY (mpn_sub (t1p, t1p, t1size, t2p, t2size)); - - MPN_NORMALIZE (t1p, t1size); - ASSERT (MPN_EQUAL_P (t1p, t1size, r->rp, r->rsize)); - } - __GMP_FREE_FUNC_LIMBS (tp, talloc); - for (i = start; i < end - 1; i++) - { - /* We should have strict inequality after each reduction step, - but we allow equal values for input. */ - ASSERT (MPN_LEQ_P (hgcd->row[i+1].rp, hgcd->row[i+1].rsize, - hgcd->row[i].rp, hgcd->row[i].rsize)); + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); } -} -#endif /* WANT_ASSERT */ - -/* Helper functions for hgcd */ -/* Sets (a, b, c, d) <-- (b, c, d, a) */ -#define HGCD_SWAP4_LEFT(row) \ -do { \ - struct hgcd_row __hgcd_swap4_left_tmp; \ - __hgcd_swap4_left_tmp = row[0]; \ - row[0] = row[1]; \ - row[1] = row[2]; \ - row[2] = row[3]; \ - row[3] = __hgcd_swap4_left_tmp; \ -} while (0) - -/* Sets (a, b, c, d) <-- (d, a, b, c) */ -#define HGCD_SWAP4_RIGHT(row) \ -do { \ - struct hgcd_row __hgcd_swap4_right_tmp; \ - __hgcd_swap4_right_tmp = row[3]; \ - row[3] = row[2]; \ - row[2] = row[1]; \ - row[1] = row[0]; \ - row[0] = __hgcd_swap4_right_tmp; \ -} while (0) - -/* Sets (a, b, c, d) <-- (c, d, a, b) */ 
-#define HGCD_SWAP4_2(row) \ -do { \ - struct hgcd_row __hgcd_swap4_2_tmp; \ - __hgcd_swap4_2_tmp = row[0]; \ - row[0] = row[2]; \ - row[2] = __hgcd_swap4_2_tmp; \ - __hgcd_swap4_2_tmp = row[1]; \ - row[1] = row[3]; \ - row[3] = __hgcd_swap4_2_tmp; \ -} while (0) - -/* Sets (a, b, c) <-- (b, c, a) */ -#define HGCD_SWAP3_LEFT(row) \ -do { \ - struct hgcd_row __hgcd_swap4_left_tmp; \ - __hgcd_swap4_left_tmp = row[0]; \ - row[0] = row[1]; \ - row[1] = row[2]; \ - row[2] = __hgcd_swap4_left_tmp; \ -} while (0) - -/* Computes P = R * S. No overlap allowed. - - Temporary space is needed for two numbers smaller than the - resulting matrix elements, i.e. bounded by 2*L <= N. */ -static mp_size_t -hgcd_mul (struct hgcd_row *P, mp_size_t alloc, - const struct hgcd_row *R, mp_size_t rsize, - const struct hgcd_row *S, mp_size_t ssize, - mp_ptr tp, mp_size_t talloc) -{ - unsigned i; - unsigned j; - - mp_size_t psize; - mp_limb_t h = 0; - int grow = 0; - MPN_NORMALIZE (R[1].uvp[1], rsize); - ASSERT (S[1].uvp[1][ssize - 1] != 0); - - psize = rsize + ssize; - ASSERT (psize <= talloc); - - if (rsize >= ssize) - { - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - { - /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */ - mp_limb_t cy; - - mpn_mul (P[i].uvp[j], R[i].uvp[0], rsize, S[0].uvp[j], ssize); - mpn_mul (tp, R[i].uvp[1], rsize, S[1].uvp[j], ssize); - - cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize); - - if (cy) - { - ASSERT (psize + 1 < alloc); - P[i].uvp[j][psize] = cy; - grow = 1; - } - else - h |= P[i].uvp[j][psize - 1]; - } - } - else + /* Try an mpn_hgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M1)) { - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - { - /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */ - mp_limb_t cy; - - mpn_mul (P[i].uvp[j], S[0].uvp[j], ssize, R[i].uvp[0], rsize); - mpn_mul (tp, S[1].uvp[j], ssize, R[i].uvp[1], rsize); - - cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize); - - if (cy) - { - ASSERT (psize + 1 < alloc); - 
P[i].uvp[j][psize] = cy; - grow = 1; - } - else - h |= P[i].uvp[j][psize - 1]; - } - } - - if (grow) - return psize + 1; - else - return psize - (h == 0); -} - -/* Computes R = W^k s->r + s->u A' - s->v B', which must be - non-negative. W denotes 2^(GMP_NUMB_BITS). Temporary space needed - is k + uvsize <= M + L = N. - - Must have v > 0, v >= u. */ - -mp_size_t -mpn_hgcd_fix (mp_size_t k, - mp_ptr rp, mp_size_t ralloc, - int sign, mp_size_t uvsize, - const struct hgcd_row *s, - mp_srcptr ap, - mp_srcptr bp, - mp_ptr tp, mp_size_t talloc) -{ - mp_size_t tsize; - mp_limb_t cy; - mp_size_t rsize; - mp_srcptr up; - mp_srcptr vp; + /* Multiply M <- M * M1 */ + hgcd_matrix_mul_1 (M, &M1, tp); - up = s->uvp[0]; vp = s->uvp[1]; - MPN_NORMALIZE (vp, uvsize); - ASSERT (uvsize > 0); - - if (sign < 0) - { - MP_SRCPTR_SWAP (up, vp); - MP_SRCPTR_SWAP (ap, bp); + /* Multiply M1^{-1} (a;b) */ + return mpn_hgcd_mul_matrix1_inverse_vector (&M1, n, ap, bp, tp); } - tsize = k + uvsize; + subtract: + /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and + bh was too small, or ah, bh were (almost) equal. Perform one + subtraction step (for possible cancellation of high limbs), + followed by one division. */ - ASSERT (k + s->rsize <= ralloc); - ASSERT (tsize <= talloc); - ASSERT (tsize <= ralloc); + /* Since we must ensure that #(a-b) > s, we handle cancellation of + high limbs explicitly up front. (FIXME: Or is it better to just + subtract, normalize, and use an addition to undo if it turns out + the the difference is too small?) */ + for (an = n; an > s; an--) + if (ap[an-1] != bp[an-1]) + break; - ASSERT (rp != s->rp); - - /* r = W^k s + u a */ - if (uvsize <= k) - mpn_mul (rp, ap, k, up, uvsize); - else - mpn_mul (rp, up, uvsize, ap, k); + if (an == s) + return 0; - if (uvsize <= s->rsize) + /* Maintain a > b. When needed, swap a and b, and let col keep track + of how to update M. 
*/ + if (ap[an-1] > bp[an-1]) { - cy = mpn_add (rp + k, s->rp, s->rsize, rp + k, uvsize); - rsize = k + s->rsize; + /* a is largest. In the subtraction step, we need to update + column 1 of M */ + col = 1; } else { - cy = mpn_add (rp + k, rp + k, uvsize, s->rp, s->rsize); - rsize = k + uvsize; - } - - if (cy) - { - ASSERT (rsize < ralloc); - rp[rsize++] = cy; + MP_PTR_SWAP (ap, bp); + col = 0; } - /* r -= v b */ - - if (uvsize <= k) - mpn_mul (tp, bp, k, vp, uvsize); - else - mpn_mul (tp, vp, uvsize, bp, k); - - ASSERT_NOCARRY (mpn_sub (rp, rp, rsize, tp, tsize)); - MPN_NORMALIZE (rp, rsize); - - return rsize; -} + bn = n; + MPN_NORMALIZE (bp, bn); + if (bn <= s) + return 0; + + /* We have #a, #b > s. When is it possible that #(a-b) < s? For + cancellation to happen, the numbers must be of the form -/* Compute r2 = r0 - q r1 */ -static void -hgcd_update_r (struct hgcd_row *r, mp_srcptr qp, mp_size_t qsize) -{ - mp_srcptr r0p = r[0].rp; - mp_srcptr r1p = r[1].rp; - mp_ptr r2p = r[2].rp; - mp_size_t r0size = r[0].rsize; - mp_size_t r1size = r[1].rsize; + a = x + 1, 0, ..., 0, al + b = x , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl - ASSERT (MPN_LESS_P (r1p, r1size, r0p, r0size)); + where al, bl denotes the least significant k limbs. If al < bl, + then #(a-b) < k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX, + then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */ - if (qsize == 0) - { - ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r1p, r1size)); - } - else if (qsize == 1) + if (ap[an-1] == bp[an-1] + 1) { - mp_size_t size; - mp_limb_t cy = mpn_mul_1 (r2p, r1p, r1size, qp[0]); - size = r1size; + mp_size_t k; + int c; + for (k = an-1; k > s; k--) + if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX) + break; - if (cy) + MPN_CMP (c, ap, bp, k); + if (c < 0) { - ASSERT (size < r0size); - r2p[size++] = cy; + mp_limb_t cy; + + /* The limbs from k and up are cancelled. 
*/ + if (k == s) + return 0; + cy = mpn_sub_n (ap, ap, bp, k); + ASSERT (cy == 1); + an = k; } - - ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size)); - } - else - { - mp_size_t size = r1size + qsize; - ASSERT (size <= r0size + 1); - - if (qsize <= r1size) - mpn_mul (r2p, r1p, r1size, qp, qsize); else - mpn_mul (r2p, qp, qsize, r1p, r1size); - - if (size > r0size) { - ASSERT (size == r0size + 1); - size--; - ASSERT (r2p[size] == 0); + ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k)); + ap[k] = 1; + an = k + 1; } - - ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size)); } + else + ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an)); + + ASSERT (an > s); + ASSERT (ap[an-1] > 0); + ASSERT (bn > s); + ASSERT (bp[bn-1] > 0); + + hgcd_matrix_update_1 (M, col); - MPN_NORMALIZE (r[2].rp, r0size); - r[2].rsize = r0size; - - ASSERT (MPN_LESS_P (r2p, r0size, r1p, r1size)); -} - -/* Compute (u2, v2) = (u0, v0) + q (u1, v1) - Return the size of the largest u,v element. - Caller must ensure that usize + qsize <= available storage */ -static mp_size_t -hgcd_update_uv (struct hgcd_row *r, mp_size_t usize, - mp_srcptr qp, mp_size_t qsize) -{ - unsigned i; - mp_size_t grow; - - ASSERT (r[1].uvp[1][usize - 1] != 0); - - /* Compute u2 = u0 + q u1 */ - - if (qsize == 0) + if (an < bn) { - /* Represents a unit quotient */ - mp_limb_t cy; - - cy = mpn_add_n (r[2].uvp[0], r[0].uvp[0], r[1].uvp[0], usize); - r[2].uvp[0][usize] = cy; - - cy = mpn_add_n (r[2].uvp[1], r[0].uvp[1], r[1].uvp[1], usize); - r[2].uvp[1][usize] = cy; - grow = cy; + MPN_PTR_SWAP (ap, an, bp, bn); + col ^= 1; } - else if (qsize == 1) + else if (an == bn) { - mp_limb_t q = qp[0]; - for (i = 0; i < 2; i++) + int c; + MPN_CMP (c, ap, bp, an); + if (c < 0) { - mp_srcptr u0p = r[0].uvp[i]; - mp_srcptr u1p = r[1].uvp[i]; - mp_ptr u2p = r[2].uvp[i]; - mp_limb_t cy; - - /* Too bad we don't have an addmul_1 with distinct source and - destination */ - cy = mpn_mul_1 (u2p, u1p, usize, q); - cy += mpn_add_n (u2p, u2p, u0p, usize); - - 
u2p[usize] = cy; - grow = cy != 0; + MP_PTR_SWAP (ap, bp); + col ^= 1; } } - else - { - for (i = 0; i < 2; i++) - { - mp_srcptr u0p = r[0].uvp[i]; - mp_srcptr u1p = r[1].uvp[i]; - mp_ptr u2p = r[2].uvp[i]; - - if (qsize <= usize) - mpn_mul (u2p, u1p, usize, qp, qsize); - else - mpn_mul (u2p, qp, qsize, u1p, usize); - ASSERT_NOCARRY (mpn_add (u2p, u2p, usize + qsize, u0p, usize)); - grow = qsize - ((u2p[usize + qsize - 1]) == 0); - } - } + /* Divide a / b. */ + qn = an + 1 - bn; - usize += grow; + /* FIXME: We could use an approximate division, that may return a + too small quotient, and only guarantess that the size of r is + almost the size of b. FIXME: Let ap and remainder overlap. */ + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn); + qn -= (tp[qn -1] == 0); - /* The values should be allocated with one limb margin */ - ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0); - ASSERT (mpn_cmp (r[1].uvp[1], r[2].uvp[1], usize) <= 0); - ASSERT (r[2].uvp[1][usize - 1] != 0); + /* Normalize remainder */ + an = bn; + for ( ; an > s; an--) + if (ap[an-1] > 0) + break; - return usize; -} - -/* Compute r0 = r2 + q r1, and the corresponding uv */ -static void -hgcd_backup (struct hgcd_row *r, mp_size_t usize, - mp_srcptr qp, mp_size_t qsize) -{ - mp_ptr r0p = r[0].rp; - mp_srcptr r1p = r[1].rp; - mp_srcptr r2p = r[2].rp; - mp_size_t r0size; - mp_size_t r1size = r[1].rsize; - mp_size_t r2size = r[2].rsize; - - mp_ptr u0p = r[0].uvp[0]; - mp_ptr v0p = r[0].uvp[1]; - mp_srcptr u1p = r[1].uvp[0]; - mp_srcptr v1p = r[1].uvp[1]; - mp_srcptr u2p = r[2].uvp[0]; - mp_srcptr v2p = r[2].uvp[1]; - - ASSERT (MPN_LESS_P (r2p, r2size, r1p, r1size)); - - if (qsize == 0) - { - /* r0 = r2 + r1 */ - mp_limb_t cy = mpn_add (r0p, r1p, r1size, r2p, r2size); - r0size = r1size; - if (cy) - r0p[r0size++] = cy; - - /* (u0,v0) = (u2,v2) - (u1, v1) */ - - ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u1p, usize)); - ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v1p, usize)); - } - else if (qsize == 1) + if (an <= s) { - /* 
r0 = r2 + q r1 - - Just like for mpn_addmul_1, the result is the same size as r1, or - one limb larger. */ - + /* Quotient is too large */ mp_limb_t cy; - cy = mpn_mul_1 (r0p, r1p, r1size, qp[0]); - cy += mpn_add (r0p, r0p, r1size, r2p, r2size); - - r0size = r1size; - if (cy) - r0p[r0size++] = cy; - - /* (u0,v0) = (u2,v2) - q (u1, v1) */ - - ASSERT_NOCARRY (mpn_mul_1 (u0p, u1p, usize, qp[0])); - ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize)); - - ASSERT_NOCARRY (mpn_mul_1 (v0p, v1p, usize, qp[0])); - ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize)); - } - else - { - /* r0 = r2 + q r1 - - Result must be of size r1size + q1size - 1, or one limb - larger. */ - - mp_size_t size; - - r0size = r1size + qsize; - if (r1size >= qsize) - mpn_mul (r0p, r1p, r1size, qp, qsize); - else - mpn_mul (r0p, qp, qsize, r1p, r1size); - - ASSERT_NOCARRY (mpn_add (r0p, r0p, r0size, r2p, r2size)); + cy = mpn_add (ap, bp, bn, ap, an); - r0size -= (r0p[r0size-1] == 0); - - /* (u0,v0) = (u2,v2) - q (u1, v1) */ - - /* We must have - - usize >= #(q u1) >= qsize + #u1 - 1 - - which means that u1 must have at least - - usize - #u1 >= qsize - 1 - - zero limbs at the high end, and similarly for v1. 
*/ - - ASSERT (qsize <= usize); - size = usize - qsize + 1; -#if WANT_ASSERT - { - mp_size_t i; - for (i = size; i < usize; i++) - { - ASSERT (u1p[i] == 0); - ASSERT (v1p[i] == 0); - } - } -#endif - /* NOTE: Needs an extra limb for the u,v values */ - - if (qsize <= size) - { - mpn_mul (u0p, u1p, size, qp, qsize); - mpn_mul (v0p, v1p, size, qp, qsize); - } - else + if (cy > 0) { - mpn_mul (u0p, qp, qsize, u1p, size); - mpn_mul (v0p, qp, qsize, v1p, size); + ASSERT (bn < n); + ap[bn] = cy; + bp[bn] = 0; + bn++; } - /* qsize + size = usize + 1 */ - ASSERT (u0p[usize] == 0); - ASSERT (v0p[usize] == 0); - - ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize)); - ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize)); + MPN_DECR_U (tp, qn, 1); + qn -= (tp[qn-1] == 0); } - r[0].rsize = r0size; -} - -/* Called after HGCD_SWAP4_RIGHT, to adjust the size field. Large - numbers in row 0 don't count, and are overwritten. */ -static void -hgcd_normalize (struct hgcd *hgcd) -{ - mp_size_t size = hgcd->size; - - /* v3 should always be the largest element */ - while (size > 0 && hgcd->row[3].uvp[1][size - 1] == 0) - { - size--; - /* Row 0 is about to be overwritten. We must zero out unused limbs */ - hgcd->row[0].uvp[0][size] = 0; - hgcd->row[0].uvp[1][size] = 0; - - ASSERT (hgcd->row[1].uvp[0][size] == 0); - ASSERT (hgcd->row[1].uvp[1][size] == 0); - ASSERT (hgcd->row[2].uvp[0][size] == 0); - ASSERT (hgcd->row[2].uvp[1][size] == 0); - ASSERT (hgcd->row[3].uvp[0][size] == 0); - } + if (qn > 0) + hgcd_matrix_update_q (M, tp, qn, col, tp + qn); - hgcd->size = size; + return bn; } -int -mpn_hgcd2_lehmer_step (struct hgcd2 *hgcd, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - struct qstack *quotients) +/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M + with elements of size at most (n+1)/2 - 1. Returns new size of a, + b, or zero if no reduction is possible. 
*/ +mp_size_t +mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, mp_ptr tp) { - mp_limb_t ah; - mp_limb_t al; - mp_limb_t bh; - mp_limb_t bl; + mp_size_t s = n/2 + 1; + mp_size_t nn; - ASSERT (asize >= bsize); - ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); + ASSERT (n > s); + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); - if (bsize < 2) + nn = hgcd_step (n, ap, bp, s, M, tp); + if (!nn) return 0; -#if 0 && WANT_TRACE - trace ("lehmer_step:\n" - " a = %Nd\n" - " b = %Nd\n", - ap, asize, bp, bsize); -#endif -#if WANT_TRACE - trace ("lehmer_step: asize = %d, bsize = %d\n", asize, bsize); -#endif - - /* The case asize == 2 is needed to take care of values that are - between one and two *full* limbs in size. */ - if (asize == 2 || (ap[asize-1] & GMP_NUMB_HIGHBIT)) - { - if (bsize < asize) - return 0; - - al = ap[asize - 2]; - ah = ap[asize - 1]; - - ASSERT (asize == bsize); - bl = bp[asize - 2]; - bh = bp[asize - 1]; - } - else - { - unsigned shift; - if (bsize + 1 < asize) - return 0; - - /* We want two *full* limbs */ - ASSERT (asize > 2); - - count_leading_zeros (shift, ap[asize-1]); -#if 0 && WANT_TRACE - trace ("shift = %d\n", shift); -#endif - if (bsize == asize) - bh = MPN_EXTRACT_LIMB (shift, bp[asize - 1], bp[asize - 2]); - else - { - ASSERT (asize == bsize + 1); - bh = bp[asize - 2] >> (GMP_LIMB_BITS - shift); - } - - bl = MPN_EXTRACT_LIMB (shift, bp[asize - 2], bp[asize - 3]); - - al = MPN_EXTRACT_LIMB (shift, ap[asize - 2], ap[asize - 3]); - ah = MPN_EXTRACT_LIMB (shift, ap[asize - 1], ap[asize - 2]); - } - -#if WANT_TRACE - trace ("lehmer_step: ah = %lx, al = %lx, bh = %lx, bl = %lx\n", - (unsigned long) ah, (unsigned long) al, - (unsigned long) bh, (unsigned long) bl); -#endif - return mpn_hgcd2 (hgcd, ah, al, bh, bl, quotients); -} - -/* Called when r2 has been computed, and it is too small. Top element - on the stack is r0/r1. One backup step is needed. 
*/ -static int -hgcd_small_1 (struct hgcd *hgcd, mp_size_t M, - struct qstack *quotients) -{ - mp_srcptr qp; - mp_size_t qsize; - - if (hgcd_start_row_p (hgcd->row, hgcd->size)) + for (;;) { - qstack_drop (quotients); - return 0; + n = nn; + ASSERT (n > s); + nn = hgcd_step (n, ap, bp, s, M, tp); + if (!nn ) + return n; } - - HGCD_SWAP4_RIGHT (hgcd->row); - hgcd_normalize (hgcd); - - qsize = qstack_get_1 (quotients, &qp); - - hgcd_backup (hgcd->row, hgcd->size, qp, qsize); - hgcd->sign = ~hgcd->sign; - -#if WANT_ASSERT - qstack_rotate (quotients, 0); -#endif - - return hgcd_jebelean (hgcd, M); } -/* Called when r3 has been computed, and is small enough. Two backup - steps are needed. */ -static int -hgcd_small_2 (struct hgcd *hgcd, mp_size_t M, - const struct qstack *quotients) +/* Multiply M by M1 from the right. Needs 4*(M->n + M1->n) + 5 limbs + of temporary storage (see mpn_matrix22_mul_itch). */ +void +mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1, + mp_ptr tp) { - mp_srcptr qp; - mp_size_t qsize; + mp_size_t n; - if (hgcd_start_row_p (hgcd->row + 2, hgcd->size)) - return 0; + /* About the new size of M:s elements. Since M1's diagonal elements + are > 0, no element can decrease. The new elements are of size + M->n + M1->n, one limb more or less. The computation of the + matrix product produces elements of size M->n + M1->n + 1. But + the true size, after normalization, may be two limbs smaller. */ - qsize = qstack_get_0 (quotients, &qp); - hgcd_backup (hgcd->row+1, hgcd->size, qp, qsize); + /* FIXME: Strassen multiplication gives only a small speedup. In FFT + multiplication range, this function could be sped up quite a lot + using invariance. 
*/ + ASSERT (M->n + M1->n < M->alloc); - if (hgcd_start_row_p (hgcd->row + 1, hgcd->size)) - return 0; + ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1] + | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0); - qsize = qstack_get_1 (quotients, &qp); - hgcd_backup (hgcd->row, hgcd->size, qp, qsize); + ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1] + | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0); - return hgcd_jebelean (hgcd, M); -} - -static void -hgcd_start (struct hgcd *hgcd, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize) -{ - MPN_COPY (hgcd->row[0].rp, ap, asize); - hgcd->row[0].rsize = asize; + mpn_matrix22_mul (M->p[0][0], M->p[0][1], + M->p[1][0], M->p[1][1], M->n, + M1->p[0][0], M1->p[0][1], + M1->p[1][0], M1->p[1][1], M1->n, tp); - MPN_COPY (hgcd->row[1].rp, bp, bsize); - hgcd->row[1].rsize = bsize; + n = M->n + M1->n + 1; + n -= ((M->p[0][0][n-1] | M->p[0][1][n-1] + | M->p[1][0][n-1] | M->p[1][1][n-1]) == 0); + n -= ((M->p[0][0][n-1] | M->p[0][1][n-1] + | M->p[1][0][n-1] | M->p[1][1][n-1]) == 0); - hgcd->sign = 0; - if (hgcd->size != 0) - { - /* We must zero out the uv array */ - unsigned i; - unsigned j; + ASSERT ((M->p[0][0][n-1] | M->p[0][1][n-1] + | M->p[1][0][n-1] | M->p[1][1][n-1]) > 0); - for (i = 0; i < 4; i++) - for (j = 0; j < 2; j++) - MPN_ZERO (hgcd->row[i].uvp[j], hgcd->size); - } -#if WANT_ASSERT - { - unsigned i; - unsigned j; - mp_size_t k; - - for (i = 0; i < 4; i++) - for (j = 0; j < 2; j++) - for (k = hgcd->size; k < hgcd->alloc; k++) - ASSERT (hgcd->row[i].uvp[j][k] == 0); - } -#endif - - hgcd->size = 1; - hgcd->row[0].uvp[0][0] = 1; - hgcd->row[1].uvp[1][0] = 1; + M->n = n; } -/* Performs one euclid step on r0, r1. Returns >= 0 if hgcd should be - terminated, -1 if we should go on */ -static int -euclid_step (struct hgcd *hgcd, mp_size_t M, - struct qstack *quotients) +/* Multiplies the least significant p limbs of (a;b) by M^-1. 
+ Temporary space needed: 2 * (p + M->n)*/ +mp_size_t +mpn_hgcd_matrix_adjust (struct hgcd_matrix *M, + mp_size_t n, mp_ptr ap, mp_ptr bp, + mp_size_t p, mp_ptr tp) { - mp_size_t asize; + /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b) + = (r11 a - r01 b; - r10 a + r00 b */ - mp_size_t qsize; - mp_size_t rsize; - mp_ptr qp; - mp_ptr rp; + mp_ptr t0 = tp; + mp_ptr t1 = tp + p + M->n; + mp_limb_t ah, bh; + mp_limb_t cy; - asize = hgcd->row[0].rsize; - rsize = hgcd->row[1].rsize; - qsize = asize - rsize + 1; + ASSERT (p + M->n < n); - /* Make sure we have space on stack */ - ASSERT_QSTACK (quotients); + /* First compute the two values depending on a, before overwriting a */ - if (qsize > quotients->limb_alloc - quotients->limb_next) - { - qstack_rotate (quotients, - qsize - (quotients->limb_alloc - quotients->limb_next)); - ASSERT (quotients->size_next < QSTACK_MAX_QUOTIENTS); - } - else if (quotients->size_next >= QSTACK_MAX_QUOTIENTS) + if (M->n >= p) { - qstack_rotate (quotients, 0); + mpn_mul (t0, M->p[1][1], M->n, ap, p); + mpn_mul (t1, M->p[1][0], M->n, ap, p); } - - ASSERT (qsize <= quotients->limb_alloc - quotients->limb_next); - - qp = quotients->limb + quotients->limb_next; - - rp = hgcd->row[2].rp; - mpn_tdiv_qr (qp, rp, 0, hgcd->row[0].rp, asize, hgcd->row[1].rp, rsize); - MPN_NORMALIZE (rp, rsize); - hgcd->row[2].rsize = rsize; - - if (qp[qsize - 1] == 0) - qsize--; - - if (qsize == 1 && qp[0] == 1) - qsize = 0; - - quotients->size[quotients->size_next++] = qsize; - quotients->limb_next += qsize; - - ASSERT_QSTACK (quotients); - - /* Update u and v */ - ASSERT (hgcd->size + qsize <= hgcd->alloc); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - if (hgcd->row[2].rsize <= M) - return hgcd_small_1 (hgcd, M, quotients); else { - /* Keep this remainder */ - hgcd->sign = ~hgcd->sign; - - HGCD_SWAP4_LEFT (hgcd->row); - return -1; + mpn_mul (t0, ap, p, M->p[1][1], M->n); + mpn_mul (t1, ap, p, M->p[1][0], 
M->n); } -} -/* Called when values have been computed in r[0] and r[1], and the - latter value is too large, and we know that it's not much too - large. Returns the updated size for the uv matrix. */ -static mp_size_t -hgcd_adjust (struct hgcd_row *r, mp_size_t size, - struct qstack *quotients) -{ - mp_limb_t c0; - mp_limb_t c1; - mp_limb_t d; - - /* Compute the correct r1. We have r1' = r1 - d r0, and we always - have d = 1 or 2. */ + /* Update a */ + MPN_COPY (ap, t0, p); + ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n); - ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); + if (M->n >= p) + mpn_mul (t0, M->p[0][1], M->n, bp, p); + else + mpn_mul (t0, bp, p, M->p[0][1], M->n); - MPN_NORMALIZE (r[1].rp, r[1].rsize); + cy = mpn_sub (ap, ap, n, t0, p + M->n); + ASSERT (cy <= ah); + ah -= cy; - if (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)) - { - c0 = mpn_add_n (r[1].uvp[0], r[1].uvp[0], r[0].uvp[0], size); - c1 = mpn_add_n (r[1].uvp[1], r[1].uvp[1], r[0].uvp[1], size); - d = 1; - } + /* Update b */ + if (M->n >= p) + mpn_mul (t0, M->p[0][0], M->n, bp, p); else - { - ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); - MPN_NORMALIZE (r[1].rp, r[1].rsize); - ASSERT (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); + mpn_mul (t0, bp, p, M->p[0][0], M->n); - c0 = mpn_addmul_1 (r[1].uvp[0], r[0].uvp[0], size, 2); - c1 = mpn_addmul_1 (r[1].uvp[1], r[0].uvp[1], size, 2); - d = 2; - } + MPN_COPY (bp, t0, p); + bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n); + cy = mpn_sub (bp, bp, n, t1, p + M->n); + ASSERT (cy <= bh); + bh -= cy; - /* FIXME: Can avoid branches */ - if (c1 != 0) + if (ah > 0 || bh > 0) { - r[1].uvp[0][size] = c0; - r[1].uvp[1][size] = c1; - size++; + ap[n] = ah; + bp[n] = bh; + n++; } else { - ASSERT (c0 == 0); + /* The subtraction can reduce the size by at most one limb. 
*/ + if (ap[n-1] == 0 && bp[n-1] == 0) + n--; } - - /* Remains to adjust the quotient on stack */ - qstack_adjust (quotients, d); - - return size; + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); + return n; } -/* Reduce using Lehmer steps. Called by mpn_hgcd when r1 has been - reduced to approximately the right size. Also used by - mpn_hgcd_lehmer. */ -static int -hgcd_final (struct hgcd *hgcd, mp_size_t M, - struct qstack *quotients) -{ - ASSERT (hgcd->row[0].rsize > M); - ASSERT (hgcd->row[1].rsize > M); - - /* Can be equal when called by hgcd_lehmer. */ - ASSERT (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[0].rp, hgcd->row[0].rsize)); - - for (;;) - { - mp_size_t L = hgcd->row[0].rsize; - - struct hgcd2 R; - int res; - - if (L <= M + 2 - && (L < M + 2 || (hgcd->row[0].rp[M+1] & GMP_NUMB_HIGHBIT) == 0)) - break; - - res = mpn_hgcd2_lehmer_step (&R, - hgcd->row[0].rp, hgcd->row[0].rsize, - hgcd->row[1].rp, hgcd->row[1].rsize, - quotients); - - if (res == 0) - { - /* We must divide to make progress */ - res = euclid_step (hgcd, M, quotients); - - if (res >= 0) - return res; - } - else if (res == 1) - { - mp_size_t qsize; - - /* The quotient that has been computed for r2 is at most 2 - off. So adjust that, and avoid a full division. */ - qstack_drop (quotients); - - /* Top two rows of R must be the identity matrix, followed - by a row (1, q). 
*/ - ASSERT (R.row[0].u == 1 && R.row[0].v == 0); - ASSERT (R.row[1].u == 0 && R.row[1].v == 1); - ASSERT (R.row[2].u == 1); - - qsize = (R.row[2].v != 0); - - hgcd_update_r (hgcd->row, &R.row[2].v, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - &R.row[2].v, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)) - hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); - - ASSERT (hgcd->size < hgcd->alloc); - - hgcd->sign = ~hgcd->sign; - HGCD_SWAP4_LEFT (hgcd->row); - } - else - { - const struct hgcd2_row *s = R.row + (res - 2); - int sign = R.sign; - /* Max size after reduction, plus one */ - mp_size_t ralloc = hgcd->row[1].rsize + 1; - - if (res == 2) - { - qstack_drop (quotients); - qstack_drop (quotients); - } - else if (res == 3) - { - sign = ~sign; - qstack_drop (quotients); - } - - /* s[0] and s[1] correct. */ - hgcd->row[2].rsize - = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc, - sign, - s[0].u, hgcd->row[0].rp, hgcd->row[0].rsize, - s[0].v, hgcd->row[1].rp, hgcd->row[1].rsize); - - hgcd->row[3].rsize - = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc, - ~sign, - s[1].u, hgcd->row[0].rp, hgcd->row[0].rsize, - s[1].v, hgcd->row[1].rp, hgcd->row[1].rsize); - - hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc, - s, hgcd->row, hgcd->size); - hgcd->sign ^= sign; - - ASSERT (hgcd->row[2].rsize > M); - -#if WANT_ASSERT - switch (res) - { - default: - ASSERT_ALWAYS (0 == "Unexpected value of res"); - break; - case 2: - ASSERT (hgcd->row[2].rsize >= L - 1); - ASSERT (hgcd->row[3].rsize >= L - 2); - ASSERT (hgcd->row[2].rsize > M + 1); - ASSERT (hgcd->row[3].rsize > M); - break; - case 3: - ASSERT (hgcd->row[2].rsize >= L - 2); - ASSERT (hgcd->row[3].rsize >= L - 2); - ASSERT (hgcd->row[3].rsize > M); - break; - case 4: - ASSERT (hgcd->row[2].rsize >= L - 2); - ASSERT (hgcd->row[3].rsize < L || hgcd->row[3].rp[L-1] == 1); - break; - } -#endif - if (hgcd->row[3].rsize 
<= M) - { - /* Can happen only in the res == 4 case */ - ASSERT (res == 4); - - /* Backup two steps */ - ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); - - return hgcd_small_2 (hgcd, M, quotients); - } - - HGCD_SWAP4_2 (hgcd->row); - } - } - - ASSERT (hgcd->row[1].rsize > M); - - for (;;) - { -#if WANT_ASSERT - mp_size_t L = hgcd->row[0].rsize; -#endif - mp_size_t ralloc; - - mp_size_t qsize; - mp_srcptr qp; - - struct hgcd2 R; - int res; - - /* We don't want hgcd2 to pickup any bits below r0p[M-1], so - don't tell mpn_hgcd2_lehmer_step about them. */ - res = mpn_hgcd2_lehmer_step (&R, - hgcd->row[0].rp+M-1, hgcd->row[0].rsize-M+1, - hgcd->row[1].rp+M-1, hgcd->row[1].rsize-M+1, - quotients); - if (res == 0) - { - /* We must divide to make progress */ - res = euclid_step (hgcd, M, quotients); - - if (res >= 0) - return res; - - continue; - } - - if (res == 1) - { - mp_size_t qsize; - - /* The quotient that has been computed for r2 is at most 2 - off. So adjust that, and avoid a full division. */ - qstack_drop (quotients); - - /* Top two rows of R must be the identity matrix, followed - by a row (1, q). */ - ASSERT (R.row[0].u == 1 && R.row[0].v == 0); - ASSERT (R.row[1].u == 0 && R.row[1].v == 1); - ASSERT (R.row[2].u == 1); - - qsize = (R.row[2].v != 0); +/* Size analysis for hgcd: - hgcd_update_r (hgcd->row, &R.row[2].v, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - &R.row[2].v, qsize); - ASSERT (hgcd->size < hgcd->alloc); + For the recursive calls, we have n1 <= ceil(n / 2). Then the + storage need is determined by the storage for the recursive call + computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1 + (after this, the storage needed for M1 can be recycled). - if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)) - hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); + Let S(r) denote the required storage. 
For M1 we need 4 * (ceil(n1/2) + 1) + = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2, + and for the hgcd_matrix_mul, we may need 4 ceil(n/2) + 1. In total, + 4 * ceil(n/4) + 4 ceil(n/2) + 5 <= 12 ceil(n/4) + 5. - ASSERT (hgcd->size < hgcd->alloc); + For the recursive call, we need S(n1) = S(ceil(n/2)). - hgcd->sign = ~hgcd->sign; - HGCD_SWAP4_LEFT (hgcd->row); - - continue; - } - - /* Now r0 and r1 are always correct. */ - /* Store new values in rows 2 and 3, to avoid overlap */ - - /* Max size after reduction, plus one */ - ralloc = hgcd->row[1].rsize + 1; - - hgcd->row[2].rsize - = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc, - R.sign, - R.row[0].u, hgcd->row[0].rp, hgcd->row[0].rsize, - R.row[0].v, hgcd->row[1].rp, hgcd->row[1].rsize); - - hgcd->row[3].rsize - = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc, - ~R.sign, - R.row[1].u, hgcd->row[0].rp, hgcd->row[0].rsize, - R.row[1].v, hgcd->row[1].rp, hgcd->row[1].rsize); - - ASSERT (hgcd->row[2].rsize >= L - 1); - ASSERT (hgcd->row[3].rsize >= L - 2); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize > M-1); - - hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc, - R.row, hgcd->row, hgcd->size); - hgcd->sign ^= R.sign; - - if (hgcd->row[3].rsize <= M) - { - /* Backup two steps */ - - /* We don't use R.row[2] and R.row[3], so drop the - corresponding quotients. 
*/ - qstack_drop (quotients); - qstack_drop (quotients); - - return hgcd_small_2 (hgcd, M, quotients); - } - - HGCD_SWAP4_2 (hgcd->row); - - if (res == 2) - { - qstack_drop (quotients); - qstack_drop (quotients); - - continue; - } - - /* We already know the correct q for computing r2 */ - - qsize = qstack_get_1 (quotients, &qp); - ASSERT (qsize < 2); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - ASSERT (hgcd->row[2].rsize >= M - 2); - - if (hgcd->row[2].rsize <= M) - { - /* Discard r3 */ - qstack_drop (quotients); - return hgcd_small_1 (hgcd, M, quotients); - } - if (res == 3) - { - /* Drop quotient for r3 */ - qstack_drop (quotients); - - hgcd->sign = ~hgcd->sign; - HGCD_SWAP4_LEFT (hgcd->row); - - continue; - } - - ASSERT (res == 4); - ASSERT (hgcd->row[2].rsize > M); - - /* We already know the correct q for computing r3 */ - qsize = qstack_get_0 (quotients, &qp); - ASSERT (qsize < 2); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row + 1, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - ASSERT (hgcd->row[3].rsize <= M + 1); - /* Appearantly not true. Probably because we have leading zeros - when we call hgcd2. */ - /* ASSERT (hgcd->row[3].rsize <= M || hgcd->row[3].rp[M] == 1); */ - - if (hgcd->row[3].rsize <= M) - return hgcd_jebelean (hgcd, M); - - HGCD_SWAP4_2 (hgcd->row); - } -} + S(n) <= 12*ceil(n/4) + 5 + S(ceil(n/2)) + <= 12*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 5k + S(ceil(n/2^k)) + <= 12*(2 ceil(n/4) + k) + 5k + S(n/2^k) + <= 24 ceil(n/4) + 17k + S(n/2^k) + +*/ mp_size_t -mpn_hgcd_itch (mp_size_t asize) +mpn_hgcd_itch (mp_size_t n) { - /* Scratch space is needed for calling hgcd. We need space for the - results of all recursive calls. 
In addition, we need space for - calling hgcd_fix and hgcd_mul, for which N = asize limbs should - be enough. */ + unsigned k; + int count; + mp_size_t nscaled; - /* Limit on the recursion depth */ - unsigned k = mpn_hgcd_max_recursion (asize); + if (BELOW_THRESHOLD (n, HGCD_THRESHOLD)) + return MPN_HGCD_LEHMER_ITCH (n); - return asize + mpn_hgcd_init_itch (asize + 6 * k) + 12 * k; -} + /* Get the recursion depth. */ + nscaled = (n - 1) / (HGCD_THRESHOLD - 1); + count_leading_zeros (count, nscaled); + k = GMP_LIMB_BITS - count; -/* Repeatedly divides A by B, until the remainder fits in M = - ceil(asize / 2) limbs. Stores cofactors in HGCD, and pushes the - quotients on STACK. On success, HGCD->row[0, 1, 2] correspond to - remainders that are larger than M limbs, while HGCD->row[3] - correspond to a remainder that fit in M limbs. - - Return 0 on failure (if B or A mod B fits in M limbs), otherwise - return one of 1 - 4 as specified for hgcd_jebelean. */ -int -mpn_hgcd (struct hgcd *hgcd, - mp_srcptr ap, mp_size_t asize, - mp_srcptr bp, mp_size_t bsize, - struct qstack *quotients, - mp_ptr tp, mp_size_t talloc) -{ - mp_size_t N = asize; - mp_size_t M = (N + 1)/2; - mp_size_t n; - mp_size_t m; - - struct hgcd R; - mp_size_t itch; + return 24 * ((n+3) / 4) + 17 * k + + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD); +} - ASSERT (M); -#if WANT_TRACE - trace ("hgcd: asize = %d, bsize = %d, HGCD_SCHOENHAGE_THRESHOLD = %d\n", - asize, bsize, HGCD_SCHOENHAGE_THRESHOLD); - if (asize < 100) - trace (" a = %Nd\n" - " b = %Nd\n", ap, asize, bp, bsize); -#endif +/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M + with elements of size at most (n+1)/2 - 1. Returns new size of a, + b, or zero if no reduction is possible. 
*/ - if (bsize <= M) +mp_size_t +mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, mp_ptr tp) +{ + mp_size_t s = n/2 + 1; + mp_size_t n2 = (3*n)/4 + 1; + + mp_size_t p, nn; + int success = 0; + + if (n <= s) + /* Happens when n <= 2, a fairly uninteresting case but exercised + by the random inputs of the testsuite. */ return 0; - ASSERT (asize >= 2); - - /* Initialize, we keep r0 and r1 as the reduced numbers (so far). */ - hgcd_start (hgcd, ap, asize, bp, bsize); - - if (BELOW_THRESHOLD (N, HGCD_SCHOENHAGE_THRESHOLD)) - return hgcd_final (hgcd, M, quotients); + ASSERT ((ap[n-1] | bp[n-1]) > 0); - /* Reduce the size to M + m + 1. Usually, only one hgcd call is - needed, but we may need multiple calls. When finished, the values - are stored in r0 (potentially large) and r1 (smaller size) */ + ASSERT ((n+1)/2 - 1 < M->alloc); - n = N - M; - m = (n + 1)/2; + if (BELOW_THRESHOLD (n, HGCD_THRESHOLD)) + return mpn_hgcd_lehmer (ap, bp, n, M, tp); - /* The second recursive call can use numbers of size up to n+3 */ - itch = mpn_hgcd_init_itch (n+3); - - ASSERT (itch <= talloc); - mpn_hgcd_init (&R, n+3, tp); - tp += itch; talloc -= itch; - - while (hgcd->row[1].rsize > M + m + 1) + p = n/2; + nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp); + if (nn > 0) { - /* Max size after reduction, plus one */ - mp_size_t ralloc = hgcd->row[1].rsize + 1; - - int res = mpn_hgcd (&R, - hgcd->row[0].rp + M, hgcd->row[0].rsize - M, - hgcd->row[1].rp + M, hgcd->row[1].rsize - M, - quotients, tp, talloc); - - if (res == 0) - { - /* We must divide to make progress */ - res = euclid_step (hgcd, M, quotients); - - if (res > 0) - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); - if (res >= 0) - return res; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); - } - else if (res <= 2) - { - /* The reason we use hgcd_adjust also when res == 2 is that - either r2 is correct, and we get it for free. - - Or r2 is too large. 
Then can correct it by a few bignum - subtractions, and we are *guaranteed* that the result is - small enough that we don't need another run through this - loop. */ - - /* FIXME: For res == 1, the newly computed row[2] will be - the same as the old row[1], so we do some unnecessary - computations. */ - - qstack_drop (quotients); - - /* Store new values in rows 2 and 3, to avoid overlap */ - hgcd->row[2].rsize - = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, - ~R.sign, R.size, &R.row[1], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - hgcd->row[3].rsize - = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, - R.sign, R.size, &R.row[2], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize > M); - - /* Computes the uv matrix for the (possibly incorrect) - values r1, r2. The elements must be smaller than the - correct ones, since they correspond to a too small q. */ - - hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc, - R.row + 1, R.size, - hgcd->row, hgcd->size, - tp, talloc); - hgcd->sign ^= ~R.sign; - - if (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)) - { - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - HGCD_SWAP4_2 (hgcd->row); - } - else - { - /* r2 was too large, i.e. q0 too small. In this case we - must have r2 % r1 <= r2 - r1 smaller than M + m + 1. */ - - hgcd->size = hgcd_adjust (hgcd->row + 2, hgcd->size, quotients); - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - ASSERT (hgcd->row[3].rsize <= M + m + 1); - - if (hgcd->row[3].rsize <= M) - { - /* Backup two steps */ - ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); - - return hgcd_small_2 (hgcd, M, quotients); - } - - HGCD_SWAP4_2 (hgcd->row); - - /* Loop always terminates here. 
*/ - break; - } - } - else if (res == 3) - { - qstack_drop(quotients); - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); - - /* Store new values in rows 2 and 3, to avoid overlap */ - hgcd->row[2].rsize - = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, - ~R.sign, R.size, &R.row[1], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - hgcd->row[3].rsize - = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, - R.sign, R.size, &R.row[2], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize > M); - - hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc, - R.row + 1, R.size, - hgcd->row, hgcd->size, - tp, talloc); - hgcd->sign ^= ~R.sign; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - HGCD_SWAP4_2 (hgcd->row); - } - else - { - ASSERT (res == 4); - - /* All of r0, r1, r3 and r3 are correct. - Compute r2 and r3 */ - - ASSERT_HGCD (&R, - hgcd->row[0].rp + M, hgcd->row[0].rsize - M, - hgcd->row[1].rp + M, hgcd->row[1].rsize - M, - 0, 4); - - /* Store new values in rows 2 and 3, to avoid overlap */ - hgcd->row[2].rsize - = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, - R.sign, R.size, &R.row[2], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - hgcd->row[3].rsize - = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, - ~R.sign, R.size, &R.row[3], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize <= M + m + 1); - - hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc, - R.row+2, R.size, - hgcd->row, hgcd->size, - tp, talloc); - hgcd->sign ^= R.sign; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - if (hgcd->row[3].rsize <= M) - { - /* Backup two steps */ - /* Both steps must always be possible, but it's not - trivial to ASSERT that here. */ - ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); - - return hgcd_small_2 (hgcd, M, quotients); - } - HGCD_SWAP4_2 (hgcd->row); - - /* Always exit the loop. 
*/ - break; - } + /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1) + = 2 (n - 1) */ + n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp); + success = 1; } - - ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize); - ASSERT (hgcd->row[1].rsize > M); - ASSERT (hgcd->row[1].rsize <= M + m + 1); - - if (hgcd->row[0].rsize > M + m + 1) + while (n > n2) { - /* One euclid step to reduce size. */ - int res = euclid_step (hgcd, M, quotients); - - if (res > 0) - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); - if (res >= 0) - return res; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); + /* Needs n + 1 storage */ + nn = hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success ? n : 0; + n = nn; + success = 1; } - ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize); - ASSERT (hgcd->row[0].rsize <= M + m + 1); - ASSERT (hgcd->row[1].rsize > M); - - /* Second phase, reduce size until we have one number of size > M - and one of size <= M+1 */ - while (hgcd->row[1].rsize > M + 1) + if (n > s + 2) { - mp_size_t k = 2*M - hgcd->row[0].rsize; -#if WANT_ASSERT - mp_size_t n1 = hgcd->row[0].rsize - k; -#endif - mp_size_t qsize; - mp_srcptr qp; - int res; - - ASSERT (k + (n1 + 1)/2 == M); - ASSERT (n1 >= 2); - - ASSERT (n1 <= 2*(m + 1)); - ASSERT (n1 <= n + 3); - - res = mpn_hgcd (&R, - hgcd->row[0].rp + k, hgcd->row[0].rsize - k, - hgcd->row[1].rp + k, hgcd->row[1].rsize - k, - quotients, tp, talloc); - - if (res == 0) - { - /* The first remainder was small. Then there's a good chance - that the remainder A % B is also small. 
*/ - res = euclid_step (hgcd, M, quotients); + struct hgcd_matrix M1; + mp_size_t scratch; - if (res > 0) - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); - if (res >= 0) - return res; + p = 2*s - n + 1; + scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p); - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); - continue; - } - - if (res == 1) + mpn_hgcd_matrix_init(&M1, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch); + if (nn > 0) { - mp_srcptr qp; - mp_size_t qsize; - - qstack_drop (quotients); - - /* Compute possibly incorrect r2 and corresponding u2, v2. - Incorrect matrix elements must be smaller than the - correct ones, since they correspond to a too small q. */ - qsize = qstack_get_0 (quotients, &qp); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - if (!MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, - hgcd->row[2].rp, hgcd->row[2].rsize)) - hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 3); - - if (hgcd->row[2].rsize <= M) - { - /* Backup one steps */ - ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); - - return hgcd_small_1 (hgcd, M, quotients); - } - - HGCD_SWAP4_LEFT (hgcd->row); - hgcd->sign = ~hgcd->sign; - continue; + /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */ + ASSERT (M->n + 2 >= M1.n); + + /* Furthermore, assume M ends with a quotient (1, q; 0, 1), + then either q or q + 1 is a correct quotient, and M1 will + start with either (1, 0; 1, 1) or (2, 1; 1, 1). This + rules out the case that the size of M * M1 is much + smaller than the expected M->n + M1->n. */ + + ASSERT (M->n + M1.n < M->alloc); + + /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1) + = 2*s <= 2*(floor(n/2) + 1) <= n + 2. 
*/ + n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch); + /* Needs 4 ceil(n/2) + 1 */ + mpn_hgcd_matrix_mul (M, &M1, tp + scratch); + success = 1; } - - /* Now r0 and r1 are always correct. */ - - /* It's possible that first two "new" r:s are the same as the - old ones. In that case skip recomputing them. */ - - if (!hgcd_start_row_p (&R.row[0], R.size)) - { - /* Store new values in rows 2 and 3, to avoid overlap */ - hgcd->row[2].rsize - = mpn_hgcd_fix (k, hgcd->row[2].rp, hgcd->row[0].rsize + 1, - R.sign, R.size, &R.row[0], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - hgcd->row[3].rsize - = mpn_hgcd_fix (k, hgcd->row[3].rp, hgcd->row[1].rsize + 1, - ~R.sign, R.size, &R.row[1], - hgcd->row[0].rp, hgcd->row[1].rp, - tp, talloc); - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (hgcd->row[3].rsize > k); - - hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc, - R.row, R.size, hgcd->row, hgcd->size, - tp, talloc); - hgcd->sign ^= R.sign; - - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); - - if (hgcd->row[3].rsize <= M) - { - /* Backup two steps */ - - /* We don't use R.row[2] and R.row[3], so drop the - corresponding quotients. 
*/ - qstack_drop (quotients); - qstack_drop (quotients); - - return hgcd_small_2 (hgcd, M, quotients); - } - - HGCD_SWAP4_2 (hgcd->row); - - if (res == 2) - { - qstack_drop (quotients); - qstack_drop (quotients); - - continue; - } - } - - ASSERT (res >= 3); - - /* We already know the correct q */ - qsize = qstack_get_1 (quotients, &qp); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - - ASSERT (hgcd->row[2].rsize > k); - if (hgcd->row[2].rsize <= M) - { - /* Discard r3 */ - qstack_drop (quotients); - return hgcd_small_1 (hgcd, M, quotients); - } - if (res == 3) - { - /* Drop quotient for r3 */ - qstack_drop (quotients); - hgcd->sign = ~hgcd->sign; - HGCD_SWAP4_LEFT (hgcd->row); - - continue; - } - - ASSERT (hgcd->row[2].rsize > M); - ASSERT (res == 4); - - /* We already know the correct q */ - qsize = qstack_get_0 (quotients, &qp); - - ASSERT (qsize + hgcd->size <= hgcd->alloc); - hgcd_update_r (hgcd->row + 1, qp, qsize); - hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size, - qp, qsize); - ASSERT (hgcd->size < hgcd->alloc); - ASSERT (hgcd->row[3].rsize <= M + 1); - - if (hgcd->row[3].rsize <= M) - { -#if WANT_ASSERT - qstack_rotate (quotients, 0); -#endif - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); - return hgcd_jebelean (hgcd, M); - } - - HGCD_SWAP4_2 (hgcd->row); } - ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); + /* This really is the base case */ + for (;;) + { + /* Needs s+3 < n */ + nn = hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success ? n : 0; - return hgcd_final (hgcd, M, quotients); + n = nn; + success = 1; + } } diff --git a/mpn/generic/hgcd2.c b/mpn/generic/hgcd2.c index 4ce579e8a..df6b94025 100644 --- a/mpn/generic/hgcd2.c +++ b/mpn/generic/hgcd2.c @@ -89,506 +89,201 @@ div2 (mp_ptr rp, return q; } #else /* GMP_NAIL_BITS != 0 */ -/* Two-limb division optimized for small quotients. 
Input words - include nails, which must be zero. */ -static inline mp_limb_t -div2 (mp_ptr rp, - mp_limb_t nh, mp_limb_t nl, - mp_limb_t dh, mp_limb_t dl) +/* Check all functions for nail support. */ +/* hgcd2 should be defined to take inputs including nail bits, and + produce a matrix with elements also including nail bits. This is + necessary, for the matrix elements to be useful with mpn_mul_1, + mpn_addmul_1 and friends. */ +#error Not implemented +#endif /* GMP_NAIL_BITS != 0 */ + +/* Reduces a,b until |a-b| fits in one limb + 1 bit. Constructs + matrix M. Returns 1 if we make progress, i.e. can perform at least + one subtraction. Otherwise returns zero.. */ + +/* FIXME: Possible optimizations: + + The div2 function starts with checking the most significant bit of + the numerator. We can maintained normalized operands here, call + hgcd with normalized operands only, which should make the code + simpler and possibly faster. + + Experiment with table lookups on the most significant bits. + + This function is also a candidate for assembler implementation. +*/ +int +mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, + struct hgcd_matrix1 *M) { - mp_limb_t q = 0; - int cnt; - - ASSERT_LIMB(nh); - ASSERT_LIMB(nl); - ASSERT_LIMB(dh); - ASSERT_LIMB(dl); - - /* FIXME: Always called with nh > 0 and dh >0. Then it should be - enough to look at the high limbs to select cnt. */ - for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++) - { - dh = (dh << 1) | (dl >> (GMP_NUMB_BITS - 1)); - dl = (dl << 1) & GMP_NUMB_MASK; - } - - while (cnt) + mp_limb_t u00, u01, u10, u11; + + if (ah < 2 || bh < 2) + return 0; + + if (ah > bh || (ah == bh && al > bl)) { - dl = (dh << (GMP_NUMB_BITS - 1)) | (dl >> 1); - dh = dh >> 1; - dl &= GMP_NUMB_MASK; - - q <<= 1; - if (nh > dh || (nh == dh && nl >= dl)) - { - /* FIXME: We could perhaps optimize this by unrolling the - loop 2^GMP_NUMB_BITS - 1 times? 
*/ - nl -= dl; - nh -= dh; - nh -= (nl >> (GMP_LIMB_BITS - 1)); - nl &= GMP_NUMB_MASK; - - q |= 1; - } - cnt--; + sub_ddmmss (ah, al, ah, al, bh, bl); + if (ah < 2) + return 0; + + u00 = u01 = u11 = 1; + u10 = 0; } - ASSERT (nh < dh || (nh == dh && nl < dl)); - rp[0] = nl; - rp[1] = nh; + else + { + sub_ddmmss (bh, bl, bh, bl, ah, al); + if (bh < 2) + return 0; - return q; -} -#endif /* GMP_NAIL_BITS != 0 */ + u00 = u10 = u11 = 1; + u01 = 0; + } -#define SUB_2(w1,w0, x1,x0, y1,y0) \ - do { \ - ASSERT_LIMB (x1); \ - ASSERT_LIMB (x0); \ - ASSERT_LIMB (y1); \ - ASSERT_LIMB (y0); \ - \ - if (GMP_NAIL_BITS == 0) \ - sub_ddmmss (w1,w0, x1,x0, y1,y0); \ - else \ - { \ - mp_limb_t __w0, __c; \ - SUBC_LIMB (__c, __w0, x0, y0); \ - (w1) = ((x1) - (y1) - __c) & GMP_NUMB_MASK; \ - (w0) = __w0; \ - } \ - } while (0) - -static inline void -qstack_push_0 (struct qstack *stack) -{ - ASSERT_QSTACK (stack); + if (ah < bh) + goto subtract_a; - if (stack->size_next >= QSTACK_MAX_QUOTIENTS) - qstack_rotate (stack, 0); + for (;;) + { + ASSERT (ah >= bh); + if (ah == bh) + break; - stack->size[stack->size_next++] = 0; -} + /* Subtract a -= q b, and multiply M from the right by (1 q ; 0 + 1), affecting the second column of M. */ + ASSERT (ah > bh); + sub_ddmmss (ah, al, ah, al, bh, bl); -static inline void -qstack_push_1 (struct qstack *stack, mp_limb_t q) -{ - ASSERT (q >= 2); + if (ah < 2) + break; - ASSERT_QSTACK (stack); + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, ah, al, bh, bl); + al = r[0]; ah = r[1]; + if (ah < 2) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + break; + } + q++; + u01 += q * u00; + u11 += q * u10; + } + subtract_a: + ASSERT (bh >= ah); + if (ah == bh) + break; - if (stack->limb_next >= stack->limb_alloc) - qstack_rotate (stack, 1); + /* Subtract b -= q a, and multiply M from the right by (1 0 ; q + 1), affecting the first column of M. 
*/ + sub_ddmmss (bh, bl, bh, bl, ah, al); - else if (stack->size_next >= QSTACK_MAX_QUOTIENTS) - qstack_rotate (stack, 0); + if (bh < 2) + break; - stack->size[stack->size_next++] = 1; - stack->limb[stack->limb_next++] = q; + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, bh, bl, ah, al); + bl = r[0]; bh = r[1]; + if (bh < 2) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + break; + } + q++; + u00 += q * u01; + u10 += q * u11; + } + } + M->u[0][0] = u00; M->u[0][1] = u01; + M->u[1][0] = u10; M->u[1][1] = u11; - ASSERT_QSTACK (stack); + return 1; } -/* Produce r_k from r_i and r_j, and push the corresponding - quotient. */ -#if __GMP_HAVE_TOKEN_PASTE -#define HGCD2_STEP(i, j, k) do { \ - SUB_2 (rh ## k, rl ## k, \ - rh ## i, rl ## i, \ - rh ## j, rl ## j); \ - \ - /* Could check here for the special case rh3 == 0, \ - but it's covered by the below condition as well */ \ - if ( rh ## k < rh ## j \ - || ( rh ## k == rh ## j \ - && rl ## k < rl ## j)) \ - { \ - /* Unit quotient */ \ - u ## k = u ## i + u ## j; \ - v ## k = v ## i + v ## j; \ - \ - if (quotients) \ - qstack_push_0 (quotients); \ - } \ - else \ - { \ - mp_limb_t r[2]; \ - mp_limb_t q = 1 + div2 (r, rh ## k, rl ## k, \ - rh ## j, rl ## j); \ - rl ## k = r[0]; rh ## k = r[1]; \ - u ## k = u ## i + q * u ## j; \ - v ## k = v ## i + q * v ## j; \ - \ - if (quotients) \ - qstack_push_1 (quotients, q); \ - } \ -} while (0) -#else /* ! 
__GMP_HAVE_TOKEN_PASTE */ -#define HGCD2_STEP(i, j, k) do { \ - SUB_2 (rh/**/k, rl/**/k, \ - rh/**/i, rl/**/i, \ - rh/**/j, rl/**/j); \ - \ - /* Could check here for the special case rh3 == 0, \ - but it's covered by the below condition as well */ \ - if ( rh/**/k < rh/**/j \ - || ( rh/**/k == rh/**/j \ - && rl/**/k < rl/**/j)) \ - { \ - /* Unit quotient */ \ - u/**/k = u/**/i + u/**/j; \ - v/**/k = v/**/i + v/**/j; \ - \ - if (quotients) \ - qstack_push_0 (quotients); \ - } \ - else \ - { \ - mp_limb_t r[2]; \ - mp_limb_t q = 1 + div2 (r, rh/**/k, rl/**/k, \ - rh/**/j, rl/**/j); \ - rl/**/k = r[0]; rh/**/k = r[1]; \ - u/**/k = u/**/i + q * u/**/j; \ - v/**/k = v/**/i + q * v/**/j; \ - \ - if (quotients) \ - qstack_push_1 (quotients, q); \ - } \ -} while (0) -#endif /* ! __GMP_HAVE_TOKEN_PASTE */ - -/* Repeatedly divides A by B, until the remainder is a single limb. - Stores cofactors in HGCD, and pushes the quotients on STACK (unless - STACK is NULL). On success, HGCD->row[0, 1, 2] correspond to - remainders that are larger than one limb, while HGCD->row[3] - correspond to a remainder that fit in a single limb. - - Return 0 on failure (if B or A mod B fits in a single limb). Return - 1 if r0 and r1 are correct, but we still make no progress because - r0 = A, r1 = B. - - Otherwise return 2, 3 or 4 depending on how many of the r:s that - satisfy Jebelean's criterion. */ -/* FIXME: There are two more micro optimizations that could be done to - this code: +/* Multiply (a;b) by M = (u00, u01; u10, u11). Needs n limbs of + temporary storage. Vector must have space for n + 1 limbs. */ +mp_size_t +mpn_hgcd_mul_matrix1_vector (struct hgcd_matrix1 *M, mp_size_t n, + mp_ptr ap, mp_ptr bp, mp_ptr tp) +{ + mp_limb_t ah, bh; - The div2 function starts with checking the most significant bit of - the numerator. When we call div2, that bit is know in advance for - all but the one or two first calls, so we could split div2 in two - functions, and call the right one. 
+ /* Compute (a,b) <-- (u00 a + u10 b, u01 a + u11 b) as - We could also have two versions of this code, with and without the - quotient argument, to avoid checking if it's NULL in the middle of - the loop. */ + t = a + a *= u00 + a += u10 * b + b *= u11 + b += u01 * t + */ -int -mpn_hgcd2 (struct hgcd2 *hgcd, - mp_limb_t ah, mp_limb_t al, - mp_limb_t bh, mp_limb_t bl, - struct qstack *quotients) -{ - /* For all divisions, we special case q = 1, which accounts for - approximately 41% of the quotients for random numbers (Knuth, - TAOCP 4.5.3) */ - - /* Use scalar variables */ - mp_limb_t rh1, rl1, u1, v1; - mp_limb_t rh2, rl2, u2, v2; - mp_limb_t rh3, rl3, u3, v3; - - ASSERT_LIMB(ah); - ASSERT_LIMB(al); - ASSERT_LIMB(bh); - ASSERT_LIMB(bl); - ASSERT (ah > bh || (ah == bh && al >= bl)); - - if (bh == 0) - return 0; + /* This copying could be avoided if we let our caller swap some + * pointers. */ + MPN_COPY (tp, ap, n); - { - mp_limb_t rh0, rl0, u0, v0; - - /* Initialize first two rows */ - rh0 = ah; rl0 = al; u0 = 1; v0 = 0; - rh1 = bh; rl1 = bl; u1 = 0; v1 = 1; - - SUB_2 (rh2, rl2, rh0, rl0, rh1, rl1); - - if (rh2 == 0) - return 0; - - if (rh2 < rh1 || (rh2 == rh1 && rl2 < rl1)) - { - /* Unit quotient */ - v2 = 1; - - if (quotients) - qstack_push_0 (quotients); - } - else - { - mp_limb_t r[2]; - mp_limb_t q = 1 + div2 (r, rh2, rl2, rh1, rl1); - - rl2 = r[0]; rh2 = r[1]; - - if (rh2 == 0) - return 0; - - v2 = q; - - if (quotients) - qstack_push_1 (quotients, q); - } - - u2 = 1; - - /* The simple version of the loop is as follows: - | - | hgcd->sign = 0; - | for (;;) - | { - | (q, rh3, rl3]) = divmod (r1, r2); - | u[3] = u1 + q * u2; - | v[3] = v1 + q * v2; - | qstack_push_1 (quotients, q); - | - | if (rh3 == 0) - | break; - | - | HGCD2_SHIFT4_LEFT (hgcd->row); - | hgcd->sign = ~hgcd->sign; - | } - | - | But then we special case for q = 1, and unroll the loop four times - | to avoid data movement. 
*/ - - for (;;) - { - HGCD2_STEP (1, 2, 3); - if (rh3 == 0) - { - hgcd->row[0].u = u0; hgcd->row[0].v = v0; - - hgcd->sign = 0; - - break; - } - HGCD2_STEP (2, 3, 0); - if (rh0 == 0) - { - hgcd->row[0].u = u1; hgcd->row[0].v = v1; - - rh1 = rh2; rl1 = rl2; u1 = u2; v1 = v2; - rh2 = rh3; rl2 = rl3; u2 = u3; v2 = v3; - rh3 = rh0; rl3 = rl0; u3 = u0; v3 = v0; - - hgcd->sign = -1; - break; - } - - HGCD2_STEP (3, 0, 1); - if (rh1 == 0) - { - hgcd->row[0].u = u2; hgcd->row[0].v = v2; - rh2 = rh0; rl2 = rl0; u2 = u0; v2 = v0; - - MP_LIMB_T_SWAP (rh1, rh3); MP_LIMB_T_SWAP (rl1, rl3); - MP_LIMB_T_SWAP ( u1, u3); MP_LIMB_T_SWAP ( v1, v3); - - hgcd->sign = 0; - break; - } - - HGCD2_STEP (0, 1, 2); - if (rh2 == 0) - { - hgcd->row[0].u = u3; hgcd->row[0].v = v3; - - rh3 = rh2; rl3 = rl2; u3 = u2; v3 = v2; - rh2 = rh1; rl2 = rl1; u2 = u1; v2 = v1; - rh1 = rh0; rl1 = rl0; u1 = u0; v1 = v0; - - hgcd->sign = -1; - break; - } - } - } - - ASSERT (rh1 != 0); - ASSERT (rh2 != 0); - ASSERT (rh3 == 0); - ASSERT (rh1 > rh2 || (rh1 == rh2 && rl1 > rl2)); - ASSERT (rh2 > rh3 || (rh2 == rh3 && rl2 > rl3)); - - /* Coefficients to be returned */ - hgcd->row[1].u = u1; hgcd->row[1].v = v1; - hgcd->row[2].u = u2; hgcd->row[2].v = v2; - hgcd->row[3].u = u3; hgcd->row[3].v = v3; - - /* Rows 1, 2 and 3 are used below, rh0, rl0, u0 and v0 are not. 
*/ -#if GMP_NAIL_BITS == 0 - { - mp_limb_t sh; - mp_limb_t sl; - mp_limb_t th; - mp_limb_t tl; - - /* Check r2 */ - /* We always have r2 > u2, v2 */ - - if (hgcd->sign >= 0) - { - /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */ - sl = u2 + u1; - sh = (sl < u1); - } - else - { - /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */ - sl = v2 + v1; - sh = (sl < v1); - } - - sub_ddmmss (th, tl, rh1, rl1, rh2, rl2); - - if (th < sh || (th == sh && tl < sl)) - return 2 - (hgcd->row[0].v == 0); - - /* Check r3 */ - - if (hgcd->sign >= 0) - { - /* Check r3 >= max (-u3, -v3) = |u3| */ - if (rl3 < u3) - return 3; - - /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/ - sl = v3 + v2; - sh = (sl < v2); - } - else - { - /* Check r3 >= max (-u3, -v3) = |v3| */ - if (rl3 < v3) - return 3; - - /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */ - sl = u3 + u2; - sh = (sl < u2); - } - - sub_ddmmss (th, tl, rh2, rl2, 0, rl3); - - if (th < sh || (th == sh && tl < sl)) - return 3; - - return 4; - } -#else /* GMP_NAIL_BITS > 0 */ - { - mp_limb_t sl; - mp_limb_t th; - mp_limb_t tl; - - /* Check r2 */ - /* We always have r2 > u2, v2 */ - - if (hgcd->sign >= 0) - { - /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */ - sl = u2 + u1; - } - else - { - /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */ - sl = v2 + v1; - } - - tl = rl1 - rl2; - th = rh1 - rh2 - (tl >> (GMP_LIMB_BITS - 1)); - ASSERT_LIMB(th); - - if (th < (CNST_LIMB(1) << GMP_NAIL_BITS) - && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl) - return 2 - (hgcd->row[0].v == 0); - - /* Check r3 */ - - if (hgcd->sign >= 0) - { - /* Check r3 >= max (-u3, -v3) = |u3| */ - if (rl3 < u3) - return 3; - - /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/ - sl = v3 + v2; - } - else - { - /* Check r3 >= max (-u3, -v3) = |v3| */ - if (rl3 < v3) - return 3; - - /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */ - sl = u3 + u2; - } - - tl = rl2 - rl3; - th = rh2 - (tl >> (GMP_LIMB_BITS - 1)); - ASSERT_LIMB(th); - - if (th < (CNST_LIMB(1) << GMP_NAIL_BITS) - && ((th << 
GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl) - return 3; - - return 4; - } -#endif /* GMP_NAIL_BITS > 0 */ + ah = mpn_mul_1 (ap, ap, n, M->u[0][0]); + ah += mpn_addmul_1 (ap, bp, n, M->u[1][0]); + + bh = mpn_mul_1 (bp, bp, n, M->u[1][1]); + bh += mpn_addmul_1 (bp, tp, n, M->u[0][1]); + + ap[n] = ah; + bp[n] = bh; + + n += (ap[n] | bp[n]) > 0; + return n; } +/* Multiply (a;b) by M^{-1} = (u11, -u01; -u10, u00) from the left. + Needs n limbs of temporary storage. */ mp_size_t -mpn_hgcd2_fix (mp_ptr rp, mp_size_t ralloc, - int sign, - mp_limb_t u, mp_srcptr ap, mp_size_t asize, - mp_limb_t v, mp_srcptr bp, mp_size_t bsize) +mpn_hgcd_mul_matrix1_inverse_vector (struct hgcd_matrix1 *M, mp_size_t n, + mp_ptr ap, mp_ptr bp, mp_ptr tp) { - mp_size_t rsize; - mp_limb_t cy; + mp_limb_t h0, h1; - ASSERT_LIMB(u); - ASSERT_LIMB(v); + /* Compute (a;b) <-- (u11 a - u01 b; -u10 a + u00 b) as - if (sign < 0) - { - MP_LIMB_T_SWAP (u,v); - MPN_SRCPTR_SWAP (ap, asize, bp, bsize); - } + t = a + a *= u11 + a -= u01 * b + b *= u00 + b -= u10 * t + */ - ASSERT (u > 0); + /* This copying could be avoided if we let our caller swap some + * pointers. 
*/ + MPN_COPY (tp, ap, n); - ASSERT (asize <= ralloc); - rsize = asize; - cy = mpn_mul_1 (rp, ap, asize, u); - if (cy) - { - ASSERT (rsize < ralloc); - rp[rsize++] = cy; - } + h0 = mpn_mul_1 (ap, ap, n, M->u[1][1]); + h1 = mpn_submul_1 (ap, bp, n, M->u[0][1]); + ASSERT (h0 == h1); - if (v > 0) - { - ASSERT (bsize <= rsize); - cy = mpn_submul_1 (rp, bp, bsize, v); - if (cy) - { - ASSERT (bsize < rsize); - ASSERT_NOCARRY (mpn_sub_1 (rp + bsize, - rp + bsize, rsize - bsize, cy)); - } + h0 = mpn_mul_1 (bp, bp, n, M->u[0][0]); + h1 = mpn_submul_1 (bp, tp, n, M->u[1][0]); + ASSERT (h0 == h1); - MPN_NORMALIZE (rp, rsize); - } - return rsize; + n -= (ap[n-1] | bp[n-1]) == 0; + return n; } -#undef HGCD2_STEP diff --git a/mpn/generic/matrix22_mul.c b/mpn/generic/matrix22_mul.c new file mode 100644 index 000000000..0b8b61303 --- /dev/null +++ b/mpn/generic/matrix22_mul.c @@ -0,0 +1,254 @@ +/* matrix22_mul.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#define MUL(rp, ap, an, bp, bn) do { \ + if (an >= bn) \ + mpn_mul (rp, ap, an, bp, bn); \ + else \ + mpn_mul (rp, bp, bn, ap, an); \ +} while (0) + +/* Inputs are unsigned. */ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + int c; + MPN_CMP (c, ap, bp, n); + if (c >= 0) + { + mpn_sub_n (rp, ap, bp, n); + return 0; + } + else + { + mpn_sub_n (rp, bp, ap, n); + return 1; + } +} + +static int +add_signed_n (mp_ptr rp, + mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n) +{ + if (as != bs) + return as ^ abs_sub_n (rp, ap, bp, n); + else + { + ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n)); + return as; + } +} + +mp_size_t +mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn) +{ + if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) + || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) + return 3*rn + 2*mn; + else + return 4*(rn + mn) + 5; +} + +/* Algorithm: + + / s0 \ / 1 0 0 0 \ / r0 \ + | s1 | | 0 1 0 0 | | r1 | + | s2 | | 0 0 1 1 | | r2 | + | s3 | = | -1 0 1 1 | \ r3 / + | s4 | | 1 0 -1 0 | + | s5 | | 1 1 -1 -1 | + \ s6 / \ 0 0 0 1 / + + / t0 \ / 1 0 0 0 \ / m0 \ + | t1 | | 0 0 1 0 | | m1 | + | t2 | | -1 1 0 0 | | m2 | + | t3 | = | 1 -1 0 1 | \ m3 / + | t4 | | 0 -1 0 1 | + | t5 | | 0 0 0 1 | + \ t6 / \ -1 1 1 -1 / + + / r0 \ / 1 1 0 0 0 0 0 \ / s0 * t0 \ + | r1 | = | 1 0 1 1 0 1 0 | | s1 * t1 | + | r2 | | 1 0 0 1 1 0 1 | | s2 * t2 | + \ r3 / \ 1 0 1 1 1 0 0 / | s3 * t3 | + | s4 * t4 | + | s5 * t5 | + \ s6 * t6 / +*/ + +/* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3). + * + * Resulting elements are of size up to rn + mn + 1. + * + * Temporary storage: 4 rn + 4 mn + 5. 
*/ +void +mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, + mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, + mp_ptr tp) +{ + mp_ptr s2, s3, t2, t3, u0, u1; + int r2s, r3s, s3s, t2s, t3s, u0s, u1s; + s2 = tp; tp += rn; + s3 = tp; tp += rn + 1; + t2 = tp; tp += mn; + t3 = tp; tp += mn + 1; + u0 = tp; tp += rn + mn + 1; + u1 = tp; /* rn + mn + 2 */ + + MUL (u0, r0, rn, m0, mn); /* 0 */ + MUL (u1, r1, rn, m2, mn); /* 1 */ + + MPN_COPY (s2, r3, rn); + + r3[rn] = mpn_add_n (r3, r3, r2, rn); + r0[rn] = 0; + s3s = abs_sub_n (s3, r3, r0, rn + 1); + t2s = abs_sub_n (t2, m1, m0, mn); + if (t2s) + { + t3[mn] = mpn_add_n (t3, m3, t2, mn); + t3s = 0; + } + else + { + t3s = abs_sub_n (t3, m3, t2, mn); + t3[mn] = 0; + } + + r2s = abs_sub_n (r2, r0, r2, rn); + r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn); + + MUL(u1, s3, rn+1, t3, mn+1); /* 3 */ + u1s = s3s ^ t3s; + ASSERT (u1[rn+mn+1] == 0); + ASSERT (u1[rn+mn] < 4); + + if (u1s) + { + u0[rn+mn] = 0; + u0s = abs_sub_n (u0, u0, u1, rn + mn + 1); + } + else + { + u0[rn+mn] = u1[rn+mn] + mpn_add_n (u0, u0, u1, rn + mn); + u0s = 0; + } + MUL(u1, r3, rn + 1, t2, mn); /* 2 */ + u1s = t2s; + ASSERT (u1[rn+mn] < 2); + + u1s = add_signed_n (u1, u0, u0s, u1, u1s, rn + mn + 1); + + t2s = abs_sub_n (t2, m3, m1, mn); + if (s3s) + { + s3[rn] += mpn_add_n (s3, s3, r1, rn); + s3s = 0; + } + else if (s3[rn] > 0) + { + s3[rn] -= mpn_sub_n (s3, s3, r1, rn); + s3s = 1; + } + else + { + s3s = abs_sub_n (s3, r1, s3, rn); + } + MUL (r1, s3, rn+1, m3, mn); /* 5 */ + ASSERT_NOCARRY(add_signed_n (r1, r1, s3s, u1, u1s, rn + mn + 1)); + ASSERT (r1[rn + mn] < 2); + + MUL (r3, r2, rn, t2, mn); /* 4 */ + r3s = r2s ^ t2s; + r3[rn + mn] = 0; + u0s = add_signed_n (u0, u0, u0s, r3, r3s, rn + mn + 1); + ASSERT_NOCARRY (add_signed_n (r3, r3, r3s, u1, u1s, rn + mn + 1)); + ASSERT (r3[rn + mn] < 2); + + if (t3s) + { + t3[mn] += mpn_add_n (t3, m2, t3, mn); + t3s = 0; + } + else if (t3[mn] > 0) + { + t3[mn] -= 
mpn_sub_n (t3, t3, m2, mn); + t3s = 1; + } + else + { + t3s = abs_sub_n (t3, m2, t3, mn); + } + MUL (r2, s2, rn, t3, mn + 1); /* 6 */ + + ASSERT_NOCARRY (add_signed_n (r2, r2, t3s, u0, u0s, rn + mn + 1)); + ASSERT (r2[rn + mn] < 2); +} + +void +mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, + mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, + mp_ptr tp) +{ + if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) + || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) + { + mp_ptr p0, p1; + unsigned i; + + /* Temporary storage: 3 rn + 2 mn */ + p0 = tp + rn; + p1 = p0 + rn + mn; + + for (i = 0; i < 2; i++) + { + MPN_COPY (tp, r0, rn); + + if (rn >= mn) + { + mpn_mul (p0, r0, rn, m0, mn); + mpn_mul (p1, r1, rn, m3, mn); + mpn_mul (r0, r1, rn, m2, mn); + mpn_mul (r1, tp, rn, m1, mn); + } + else + { + mpn_mul (p0, m0, mn, r0, rn); + mpn_mul (p1, m3, mn, r1, rn); + mpn_mul (r0, m2, mn, r1, rn); + mpn_mul (r1, m1, mn, tp, rn); + } + r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn); + r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn); + + r0 = r2; r1 = r3; + } + } + else + mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn, + m0, m1, m2, m3, mn, tp); +} diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h index 8dd018237..22a8cfff8 100644 --- a/mpn/ia64/gmp-mparam.h +++ b/mpn/ia64/gmp-mparam.h @@ -37,9 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 72 #define POWM_THRESHOLD 295 -#define HGCD_SCHOENHAGE_THRESHOLD 191 +#define HGCD_THRESHOLD 191 #define GCD_ACCEL_THRESHOLD 10 -#define GCD_SCHOENHAGE_THRESHOLD 336 +#define GCD_DC_THRESHOLD 336 #define GCDEXT_SCHOENHAGE_THRESHOLD 649 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/m68k/gmp-mparam.h b/mpn/m68k/gmp-mparam.h index c18bc5a63..c62304653 100644 --- a/mpn/m68k/gmp-mparam.h +++ b/mpn/m68k/gmp-mparam.h @@ -37,10 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 55 #define POWM_THRESHOLD 65 -#define HGCD_SCHOENHAGE_THRESHOLD 116 +#define HGCD_THRESHOLD 116 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 590 -#define GCDEXT_THRESHOLD 35 +#define GCD_DC_THRESHOLD 590 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ diff --git a/mpn/minithres/gmp-mparam.h b/mpn/minithres/gmp-mparam.h index 7586b7a0f..31b74337b 100644 --- a/mpn/minithres/gmp-mparam.h +++ b/mpn/minithres/gmp-mparam.h @@ -33,9 +33,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 6 #define POWM_THRESHOLD 4 -#define HGCD_SCHOENHAGE_THRESHOLD 10 +#define HGCD_THRESHOLD 10 #define GCD_ACCEL_THRESHOLD 2 -#define GCD_SCHOENHAGE_THRESHOLD 20 +#define GCD_DC_THRESHOLD 20 #define GCDEXT_SCHOENHAGE_THRESHOLD 20 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/mips32/gmp-mparam.h b/mpn/mips32/gmp-mparam.h index a5b736de3..d86fd3f01 100644 --- a/mpn/mips32/gmp-mparam.h +++ b/mpn/mips32/gmp-mparam.h @@ -37,7 +37,6 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 78 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 18 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/mips64/gmp-mparam.h b/mpn/mips64/gmp-mparam.h index 23b012149..d189e895c 100644 --- a/mpn/mips64/gmp-mparam.h +++ b/mpn/mips64/gmp-mparam.h @@ -36,10 +36,9 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 53 #define POWM_THRESHOLD 61 -#define HGCD_SCHOENHAGE_THRESHOLD 116 +#define HGCD_THRESHOLD 116 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 492 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 492 #define JACOBI_BASE_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/pa32/gmp-mparam.h b/mpn/pa32/gmp-mparam.h index 3c6d36c57..005539c0d 100644 --- a/mpn/pa32/gmp-mparam.h +++ b/mpn/pa32/gmp-mparam.h @@ -49,6 +49,5 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #ifndef GCD_ACCEL_THRESHOLD #define GCD_ACCEL_THRESHOLD 46 #endif -#ifndef GCDEXT_THRESHOLD #define GCDEXT_THRESHOLD 33 #endif diff --git a/mpn/pa32/hppa1_1/gmp-mparam.h b/mpn/pa32/hppa1_1/gmp-mparam.h index d3d6d4436..5ced74548 100644 --- a/mpn/pa32/hppa1_1/gmp-mparam.h +++ b/mpn/pa32/hppa1_1/gmp-mparam.h @@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 84 #define POWM_THRESHOLD 166 -#define HGCD_SCHOENHAGE_THRESHOLD 231 +#define HGCD_THRESHOLD 231 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 823 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 823 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 5 diff --git a/mpn/pa32/hppa2_0/gmp-mparam.h b/mpn/pa32/hppa2_0/gmp-mparam.h index 29ea97506..f5667840a 100644 --- a/mpn/pa32/hppa2_0/gmp-mparam.h +++ b/mpn/pa32/hppa2_0/gmp-mparam.h @@ -35,7 +35,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 166 #define GCD_ACCEL_THRESHOLD 4 -#define GCDEXT_THRESHOLD 0 #define DIVREM_1_NORM_THRESHOLD 4 #define DIVREM_1_UNNORM_THRESHOLD 6 diff --git a/mpn/pa64/gmp-mparam.h b/mpn/pa64/gmp-mparam.h index 537da5f71..e9d058f6b 100644 --- a/mpn/pa64/gmp-mparam.h +++ b/mpn/pa64/gmp-mparam.h @@ -39,10 +39,9 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 123 #define POWM_THRESHOLD 212 -#define HGCD_SCHOENHAGE_THRESHOLD 292 +#define HGCD_THRESHOLD 292 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 1498 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 1498 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/power/gmp-mparam.h b/mpn/power/gmp-mparam.h index 8cc6bf0c7..f9b10e6a4 100644 --- a/mpn/power/gmp-mparam.h +++ b/mpn/power/gmp-mparam.h @@ -30,10 +30,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 36 #define POWM_THRESHOLD 69 -#define HGCD_SCHOENHAGE_THRESHOLD 97 +#define HGCD_THRESHOLD 97 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 590 -#define GCDEXT_THRESHOLD 41 +#define GCD_DC_THRESHOLD 590 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 12 diff --git a/mpn/powerpc32/750/gmp-mparam.h b/mpn/powerpc32/750/gmp-mparam.h index f20fd665f..d604e6ed4 100644 --- a/mpn/powerpc32/750/gmp-mparam.h +++ b/mpn/powerpc32/750/gmp-mparam.h @@ -35,10 +35,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 35 #define POWM_THRESHOLD 48 -#define HGCD_SCHOENHAGE_THRESHOLD 93 +#define HGCD_THRESHOLD 93 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 676 -#define GCDEXT_THRESHOLD 31 +#define GCD_DC_THRESHOLD 676 #define JACOBI_BASE_METHOD 1 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/powerpc32/gmp-mparam.h b/mpn/powerpc32/gmp-mparam.h index 0387e2fb7..a77c98e8a 100644 --- a/mpn/powerpc32/gmp-mparam.h +++ b/mpn/powerpc32/gmp-mparam.h @@ -41,10 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 45 #define POWM_THRESHOLD 89 -#define HGCD_SCHOENHAGE_THRESHOLD 145 +#define HGCD_THRESHOLD 145 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 738 -#define GCDEXT_THRESHOLD 16 +#define GCD_DC_THRESHOLD 738 #define JACOBI_BASE_METHOD 1 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/powerpc64/gmp-mparam.h b/mpn/powerpc64/gmp-mparam.h index 6fe8a8d40..e0ab478e3 100644 --- a/mpn/powerpc64/gmp-mparam.h +++ b/mpn/powerpc64/gmp-mparam.h @@ -37,10 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 28 #define POWM_THRESHOLD 40 -#define HGCD_SCHOENHAGE_THRESHOLD 56 +#define HGCD_THRESHOLD 56 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 408 -#define GCDEXT_THRESHOLD 151 +#define GCD_DC_THRESHOLD 408 #define JACOBI_BASE_METHOD 1 #define MOD_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/s390/gmp-mparam.h b/mpn/s390/gmp-mparam.h index b09191456..d73884667 100644 --- a/mpn/s390/gmp-mparam.h +++ b/mpn/s390/gmp-mparam.h @@ -35,7 +35,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 63 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 28 #define DIVREM_1_NORM_THRESHOLD 0 #define DIVREM_1_UNNORM_THRESHOLD 5 diff --git a/mpn/sparc32/gmp-mparam.h b/mpn/sparc32/gmp-mparam.h index d275da51a..3bc6cd6db 100644 --- a/mpn/sparc32/gmp-mparam.h +++ b/mpn/sparc32/gmp-mparam.h @@ -32,7 +32,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 28 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 0 /* always */ #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 3 diff --git a/mpn/sparc32/v8/gmp-mparam.h b/mpn/sparc32/v8/gmp-mparam.h index fde006e08..f042c19e5 100644 --- a/mpn/sparc32/v8/gmp-mparam.h +++ b/mpn/sparc32/v8/gmp-mparam.h @@ -35,10 +35,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 24 #define POWM_THRESHOLD 38 -#define HGCD_SCHOENHAGE_THRESHOLD 69 +#define HGCD_THRESHOLD 69 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 498 -#define GCDEXT_THRESHOLD 0 /* always */ +#define GCD_DC_THRESHOLD 498 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 6 diff --git a/mpn/sparc32/v8/supersparc/gmp-mparam.h b/mpn/sparc32/v8/supersparc/gmp-mparam.h index c6f2d83eb..feb90ef40 100644 --- a/mpn/sparc32/v8/supersparc/gmp-mparam.h +++ b/mpn/sparc32/v8/supersparc/gmp-mparam.h @@ -35,10 +35,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 26 #define POWM_THRESHOLD 79 -#define HGCD_SCHOENHAGE_THRESHOLD 97 +#define HGCD_THRESHOLD 97 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 470 -#define GCDEXT_THRESHOLD 14 +#define GCD_DC_THRESHOLD 470 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/sparc32/v9/gmp-mparam.h b/mpn/sparc32/v9/gmp-mparam.h index 2f11e400e..3d48d743b 100644 --- a/mpn/sparc32/v9/gmp-mparam.h +++ b/mpn/sparc32/v9/gmp-mparam.h @@ -34,10 +34,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 125 #define POWM_THRESHOLD 150 -#define HGCD_SCHOENHAGE_THRESHOLD 210 +#define HGCD_THRESHOLD 210 #define GCD_ACCEL_THRESHOLD 4 -#define GCD_SCHOENHAGE_THRESHOLD 1291 -#define GCDEXT_THRESHOLD 9 +#define GCD_DC_THRESHOLD 1291 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ diff --git a/mpn/sparc64/gmp-mparam.h b/mpn/sparc64/gmp-mparam.h index 4bceda1db..9c59e698f 100644 --- a/mpn/sparc64/gmp-mparam.h +++ b/mpn/sparc64/gmp-mparam.h @@ -44,7 +44,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define POWM_THRESHOLD 85 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 20 #define JACOBI_BASE_METHOD 2 #define DIVREM_1_NORM_THRESHOLD 3 diff --git a/mpn/vax/gmp-mparam.h b/mpn/vax/gmp-mparam.h index 4b7a2156d..ea262ddc4 100644 --- a/mpn/vax/gmp-mparam.h +++ b/mpn/vax/gmp-mparam.h @@ -32,7 +32,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ /* #define POWM_THRESHOLD */ /* #define GCD_ACCEL_THRESHOLD */ -#define GCDEXT_THRESHOLD 40 /* #define JACOBI_BASE_METHOD */ /* #define DIVREM_1_NORM_THRESHOLD */ diff --git a/mpn/x86/i486/gmp-mparam.h b/mpn/x86/i486/gmp-mparam.h index f064a3e69..aaddea9f1 100644 --- a/mpn/x86/i486/gmp-mparam.h +++ b/mpn/x86/i486/gmp-mparam.h @@ -37,7 +37,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 38 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 55 #define JACOBI_BASE_METHOD 2 #define USE_PREINV_DIVREM_1 0 diff --git a/mpn/x86/k6/gmp-mparam.h b/mpn/x86/k6/gmp-mparam.h index fc3303880..dbf8c59c8 100644 --- a/mpn/x86/k6/gmp-mparam.h +++ b/mpn/x86/k6/gmp-mparam.h @@ -37,10 +37,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 76 #define POWM_THRESHOLD 97 -#define HGCD_SCHOENHAGE_THRESHOLD 242 +#define HGCD_THRESHOLD 242 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 1243 -#define GCDEXT_THRESHOLD 40 +#define GCD_DC_THRESHOLD 1243 #define JACOBI_BASE_METHOD 2 #define USE_PREINV_DIVREM_1 0 diff --git a/mpn/x86/k7/gmp-mparam.h b/mpn/x86/k7/gmp-mparam.h index a3927784d..5c5c1195e 100644 --- a/mpn/x86/k7/gmp-mparam.h +++ b/mpn/x86/k7/gmp-mparam.h @@ -41,9 +41,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 84 #define POWM_THRESHOLD 134 -#define HGCD_SCHOENHAGE_THRESHOLD 220 +#define HGCD_THRESHOLD 220 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 908 +#define GCD_DC_THRESHOLD 908 #define GCDEXT_SCHOENHAGE_THRESHOLD 683 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/x86/p6/gmp-mparam.h b/mpn/x86/p6/gmp-mparam.h index 217facab4..a85c50027 100644 --- a/mpn/x86/p6/gmp-mparam.h +++ b/mpn/x86/p6/gmp-mparam.h @@ -45,7 +45,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define POWM_THRESHOLD 131 #define GCD_ACCEL_THRESHOLD 3 -#define GCDEXT_THRESHOLD 33 #define JACOBI_BASE_METHOD 1 #define USE_PREINV_DIVREM_1 0 diff --git a/mpn/x86/p6/mmx/gmp-mparam.h b/mpn/x86/p6/mmx/gmp-mparam.h index 1456b53a1..c1fa872f0 100644 --- a/mpn/x86/p6/mmx/gmp-mparam.h +++ b/mpn/x86/p6/mmx/gmp-mparam.h @@ -54,9 +54,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 48 -#define HGCD_SCHOENHAGE_THRESHOLD 145 +#define HGCD_THRESHOLD 145 #define GCD_ACCEL_THRESHOLD 5 -#define GCD_SCHOENHAGE_THRESHOLD 537 +#define GCD_DC_THRESHOLD 537 #define GCDEXT_SCHOENHAGE_THRESHOLD 948 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/x86/pentium/gmp-mparam.h b/mpn/x86/pentium/gmp-mparam.h index c7f398da8..5c49c4e3c 100644 --- a/mpn/x86/pentium/gmp-mparam.h +++ b/mpn/x86/pentium/gmp-mparam.h @@ -42,10 +42,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DIV_DC_THRESHOLD 52 #define POWM_THRESHOLD 77 -#define HGCD_SCHOENHAGE_THRESHOLD 121 +#define HGCD_THRESHOLD 121 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 615 -#define GCDEXT_THRESHOLD 13 +#define GCD_DC_THRESHOLD 615 #define JACOBI_BASE_METHOD 2 #define USE_PREINV_DIVREM_1 0 diff --git a/mpn/x86/pentium/mmx/gmp-mparam.h b/mpn/x86/pentium/mmx/gmp-mparam.h index 40eaecd6f..aae5fec48 100644 --- a/mpn/x86/pentium/mmx/gmp-mparam.h +++ b/mpn/x86/pentium/mmx/gmp-mparam.h @@ -42,9 +42,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DIV_DC_THRESHOLD 37 #define POWM_THRESHOLD 73 -#define HGCD_SCHOENHAGE_THRESHOLD 97 +#define HGCD_THRESHOLD 97 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 849 +#define GCD_DC_THRESHOLD 849 #define GCDEXT_THRESHOLD 14 #define JACOBI_BASE_METHOD 2 diff --git a/mpn/x86/pentium4/sse2/gmp-mparam.h b/mpn/x86/pentium4/sse2/gmp-mparam.h index 113356dcc..3ad7a93a1 100644 --- a/mpn/x86/pentium4/sse2/gmp-mparam.h +++ b/mpn/x86/pentium4/sse2/gmp-mparam.h @@ -48,9 +48,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 80 -#define HGCD_SCHOENHAGE_THRESHOLD 101 +#define HGCD_THRESHOLD 101 #define GCD_ACCEL_THRESHOLD 6 -#define GCD_SCHOENHAGE_THRESHOLD 341 +#define GCD_DC_THRESHOLD 341 #define GCDEXT_SCHOENHAGE_THRESHOLD 375 #define JACOBI_BASE_METHOD 1 diff --git a/mpn/x86_64/core2/gmp-mparam.h b/mpn/x86_64/core2/gmp-mparam.h index e4a4ea2e8..44e3af47d 100644 --- a/mpn/x86_64/core2/gmp-mparam.h +++ b/mpn/x86_64/core2/gmp-mparam.h @@ -49,10 +49,10 @@ MA 02110-1301, USA. 
*/ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 40 -#define HGCD_SCHOENHAGE_THRESHOLD 191 -#define GCD_ACCEL_THRESHOLD 5 -#define GCD_SCHOENHAGE_THRESHOLD 948 -#define GCDEXT_SCHOENHAGE_THRESHOLD 254 +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD_THRESHOLD 191 +#define GCD_DC_THRESHOLD 948 +#define GCDEXT_DC_THRESHOLD 254 #define JACOBI_BASE_METHOD 1 #define MOD_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/x86_64/gmp-mparam.h b/mpn/x86_64/gmp-mparam.h index 3c3d94315..fc2cd275c 100644 --- a/mpn/x86_64/gmp-mparam.h +++ b/mpn/x86_64/gmp-mparam.h @@ -47,10 +47,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 50 -#define HGCD_SCHOENHAGE_THRESHOLD 145 +#define MATRIX22_STRASSEN_THRESHOLD 22 +#define HGCD_THRESHOLD 111 #define GCD_ACCEL_THRESHOLD 3 -#define GCD_SCHOENHAGE_THRESHOLD 445 -#define GCDEXT_SCHOENHAGE_THRESHOLD 713 +#define GCD_DC_THRESHOLD 412 +#define GCDEXT_DC_THRESHOLD 390 #define JACOBI_BASE_METHOD 1 #define MOD_1_NORM_THRESHOLD 0 /* always */ diff --git a/mpn/x86_64/pentium4/gmp-mparam.h b/mpn/x86_64/pentium4/gmp-mparam.h index e1c56bcac..afb106f59 100644 --- a/mpn/x86_64/pentium4/gmp-mparam.h +++ b/mpn/x86_64/pentium4/gmp-mparam.h @@ -54,9 +54,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define DC_BDIV_Q_THRESHOLD 10 #define DIVEXACT_JEB_THRESHOLD 27 -#define HGCD_SCHOENHAGE_THRESHOLD 133 +#define HGCD_THRESHOLD 133 #define GCD_ACCEL_THRESHOLD 10 -#define GCD_SCHOENHAGE_THRESHOLD 792 +#define GCD_DC_THRESHOLD 792 #define GCDEXT_SCHOENHAGE_THRESHOLD 339 #define JACOBI_BASE_METHOD 1 diff --git a/tests/mpn/Makefile.am b/tests/mpn/Makefile.am index decce7182..f67138a6c 100644 --- a/tests/mpn/Makefile.am +++ b/tests/mpn/Makefile.am @@ -22,7 +22,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/tests LDADD = $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la check_PROGRAMS = t-asmtype t-aors_1 t-divrem_1 t-fat t-get_d \ - t-instrument t-iord_u t-mp_bases t-perfsqr t-scan t-hgcd + t-instrument t-iord_u t-mp_bases t-perfsqr t-scan t-hgcd t-matrix22 TESTS = $(check_PROGRAMS) diff --git a/tests/mpn/t-hgcd.c b/tests/mpn/t-hgcd.c index 94d4ca95a..2615fd679 100644 --- a/tests/mpn/t-hgcd.c +++ b/tests/mpn/t-hgcd.c @@ -25,7 +25,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #include "gmp-impl.h" #include "tests.h" -static int one_test __GMP_PROTO ((mpz_t, mpz_t, int)); +static mp_size_t one_test __GMP_PROTO ((mpz_t, mpz_t, int)); static void debug_mp __GMP_PROTO ((mpz_t, int)); #define MIN_OPERAND_SIZE 2 @@ -34,31 +34,26 @@ static void debug_mp __GMP_PROTO ((mpz_t, int)); struct value { int res; const char *a; const char *b; }; static const struct value hgcd_values[] = { #if GMP_NUMB_BITS == 32 - { 4, + { 5, "0x1bddff867272a9296ac493c251d7f46f09a5591fe", "0xb55930a2a68a916450a7de006031068c5ddb0e5c" }, { 4, "0x2f0ece5b1ee9c15e132a01d55768dc13", "0x1c6f4fd9873cdb24466e6d03e1cc66e7" }, - { 4, "0x7FFFFC003FFFFFFFFFC5", "0x3FFFFE001FFFFFFFFFE3"}, + { 3, "0x7FFFFC003FFFFFFFFFC5", "0x3FFFFE001FFFFFFFFFE3"}, #endif { -1, NULL, NULL } }; struct hgcd_ref { - /* Sign here, u and v are stored as absolute values */ - int sign; - - mpz_t r[4]; - mpz_t u[4]; - mpz_t v[4]; + mpz_t m[2][2]; }; static void hgcd_ref_init __GMP_PROTO ((struct hgcd_ref *hgcd)); static void hgcd_ref_clear __GMP_PROTO ((struct hgcd_ref *hgcd)); -static int hgcd_ref __GMP_PROTO ((struct hgcd_ref *hgcd, const mpz_t a, const mpz_t b)); -static int hgcd_ref_equal __GMP_PROTO ((const struct hgcd *hgcd, const struct hgcd_ref *ref)); +static int hgcd_ref __GMP_PROTO ((struct hgcd_ref *hgcd, mpz_t a, mpz_t b)); +static int hgcd_ref_equal __GMP_PROTO ((const struct hgcd_matrix *hgcd, const struct hgcd_ref *ref)); int main (int argc, char **argv) @@ -80,7 +75,7 @@ main (int argc, char **argv) for (i = 0; hgcd_values[i].res >= 0; i++) { - int res; + mp_size_t res; mpz_set_str (op1, hgcd_values[i].a, 0); mpz_set_str (op2, hgcd_values[i].b, 0); @@ -117,7 +112,7 @@ main (int argc, char **argv) if (mpz_cmp (op1, op2) < 0) mpz_swap (op1, op2); - if (mpz_size(op1) > 0) + if (mpz_size (op1) > 0) one_test (op1, op2, i); /* Generate a division chain backwards, allowing otherwise @@ -133,7 +128,7 @@ main (int argc, char **argv) chain_len = 1000000; #else mpz_urandomb (bs, rands, 32); - 
chain_len = mpz_get_ui (bs) % (GMP_NUMB_BITS * GCD_SCHOENHAGE_THRESHOLD / 256); + chain_len = mpz_get_ui (bs) % (GMP_NUMB_BITS * GCD_DC_THRESHOLD / 256); #endif for (j = 0; j < chain_len; j++) @@ -146,7 +141,7 @@ main (int argc, char **argv) mpz_add (op1, op1, temp1); /* Don't generate overly huge operands. */ - if (SIZ (op1) > 3 * GCD_SCHOENHAGE_THRESHOLD) + if (SIZ (op1) > 3 * GCD_DC_THRESHOLD) break; mpz_urandomb (bs, rands, 32); @@ -157,13 +152,13 @@ main (int argc, char **argv) mpz_add (op2, op2, temp1); /* Don't generate overly huge operands. */ - if (SIZ (op2) > 3 * GCD_SCHOENHAGE_THRESHOLD) + if (SIZ (op2) > 3 * GCD_DC_THRESHOLD) break; } if (mpz_cmp (op1, op2) < 0) mpz_swap (op1, op2); - if (mpz_size(op1) > 0) + if (mpz_size (op1) > 0) one_test (op1, op2, i); } @@ -177,33 +172,37 @@ debug_mp (mpz_t x, int base) } static int +mpz_mpn_equal (const mpz_t a, mp_srcptr bp, mp_size_t bsize); + +static mp_size_t one_test (mpz_t a, mpz_t b, int i) { - struct hgcd hgcd; + struct hgcd_matrix hgcd; struct hgcd_ref ref; - struct qstack quotients; - int res[2]; + + mpz_t ref_r0; + mpz_t ref_r1; + mpz_t hgcd_r0; + mpz_t hgcd_r1; + + mp_size_t res[2]; mp_size_t asize; mp_size_t bsize; mp_size_t hgcd_init_scratch; - mp_size_t qstack_scratch; mp_size_t hgcd_scratch; mp_ptr hgcd_init_tp; - mp_ptr qstack_tp; mp_ptr hgcd_tp; asize = a->_mp_size; bsize = b->_mp_size; - hgcd_init_scratch = mpn_hgcd_init_itch (asize); - hgcd_init_tp = refmpn_malloc_limbs (hgcd_init_scratch); - mpn_hgcd_init (&hgcd, asize, hgcd_init_tp); + ASSERT (asize >= bsize); - qstack_scratch = qstack_itch (asize); - qstack_tp = refmpn_malloc_limbs (qstack_scratch); - qstack_init ("ients, asize, qstack_tp, qstack_scratch); + hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (asize); + hgcd_init_tp = refmpn_malloc_limbs (hgcd_init_scratch); + mpn_hgcd_matrix_init (&hgcd, asize, hgcd_init_tp); hgcd_scratch = mpn_hgcd_itch (asize); hgcd_tp = refmpn_malloc_limbs (hgcd_scratch); @@ -221,28 +220,37 @@ one_test (mpz_t 
a, mpz_t b, int i) #endif hgcd_ref_init (&ref); - res[0] = hgcd_ref (&ref, a, b); - res[1] = mpn_hgcd (&hgcd, - a->_mp_d, asize, - b->_mp_d, bsize, - "ients, - hgcd_tp, hgcd_scratch); + mpz_init_set (ref_r0, a); + mpz_init_set (ref_r1, b); + res[0] = hgcd_ref (&ref, ref_r0, ref_r1); + + mpz_init_set (hgcd_r0, a); + mpz_init_set (hgcd_r1, b); + if (bsize < asize) + { + _mpz_realloc (hgcd_r1, asize); + MPN_ZERO (hgcd_r1->_mp_d + bsize, asize - bsize); + } + res[1] = mpn_hgcd (hgcd_r0->_mp_d, + hgcd_r1->_mp_d, + asize, + &hgcd, hgcd_tp); if (res[0] != res[1]) { fprintf (stderr, "ERROR in test %d\n", i); - fprintf (stderr, "Different return code from hgcd and hgcd_ref\n"); + fprintf (stderr, "Different return value from hgcd and hgcd_ref\n"); fprintf (stderr, "op1="); debug_mp (a, -16); fprintf (stderr, "op2="); debug_mp (b, -16); - fprintf (stderr, "hgcd_ref: %d\n", res[0]); - fprintf (stderr, "mpn_hgcd: %d\n", res[1]); + fprintf (stderr, "hgcd_ref: %ld\n", (long) res[0]); + fprintf (stderr, "mpn_hgcd: %ld\n", (long) res[1]); abort (); } if (res[0] > 0) { - ASSERT_HGCD (&hgcd, a->_mp_d, asize, b->_mp_d, bsize, 0, 4); - - if (!hgcd_ref_equal (&hgcd, &ref)) + if (!hgcd_ref_equal (&hgcd, &ref) + || !mpz_mpn_equal (ref_r0, hgcd_r0->_mp_d, res[1]) + || !mpz_mpn_equal (ref_r1, hgcd_r1->_mp_d, res[1])) { fprintf (stderr, "ERROR in test %d\n", i); fprintf (stderr, "mpn_hgcd and hgcd_ref returned different values\n"); @@ -253,9 +261,12 @@ one_test (mpz_t a, mpz_t b, int i) } refmpn_free_limbs (hgcd_init_tp); - refmpn_free_limbs (qstack_tp); refmpn_free_limbs (hgcd_tp); hgcd_ref_clear (&ref); + mpz_clear (ref_r0); + mpz_clear (ref_r1); + mpz_clear (hgcd_r0); + mpz_clear (hgcd_r1); return res[0]; } @@ -264,11 +275,11 @@ static void hgcd_ref_init (struct hgcd_ref *hgcd) { unsigned i; - for (i = 0; i<4; i++) + for (i = 0; i<2; i++) { - mpz_init (hgcd->r[i]); - mpz_init (hgcd->u[i]); - mpz_init (hgcd->v[i]); + unsigned j; + for (j = 0; j<2; j++) + mpz_init (hgcd->m[i][j]); } } @@ 
-276,137 +287,91 @@ static void hgcd_ref_clear (struct hgcd_ref *hgcd) { unsigned i; - for (i = 0; i<4; i++) + for (i = 0; i<2; i++) { - mpz_clear (hgcd->r[i]); - mpz_clear (hgcd->u[i]); - mpz_clear (hgcd->v[i]); + unsigned j; + for (j = 0; j<2; j++) + mpz_clear (hgcd->m[i][j]); } } + static int -hgcd_ref (struct hgcd_ref *hgcd, const mpz_t a, const mpz_t b) +sdiv_qr (mpz_t q, mpz_t r, mp_size_t s, const mpz_t a, const mpz_t b) { - mp_size_t M = (a->_mp_size + 1) / 2; - mpz_t t; + mpz_fdiv_qr (q, r, a, b); + if (mpz_size (r) <= s) + { + mpz_add (r, r, b); + mpz_sub_ui (q, q, 1); + } + + return (mpz_sgn (q) > 0); +} + +static int +hgcd_ref (struct hgcd_ref *hgcd, mpz_t a, mpz_t b) +{ + mp_size_t n = MAX (mpz_size (a), mpz_size (b)); + mp_size_t s = n/2 + 1; + mp_size_t asize; + mp_size_t bsize; mpz_t q; int res; - if (mpz_size(b) <= M) + if (mpz_size (a) <= s || mpz_size (b) <= s) return 0; - mpz_init (q); - mpz_fdiv_qr(q, hgcd->r[2], a, b); - - if (mpz_size (hgcd->r[2]) <= M) + res = mpz_cmp (a, b); + if (res < 0) { - mpz_clear (q); - return 0; - } - - mpz_set (hgcd->r[0], a); mpz_set (hgcd->r[1], b); + mpz_sub (b, b, a); + if (mpz_size (b) <= s) + return 0; - mpz_set_ui (hgcd->u[0], 1); mpz_set_ui (hgcd->v[0], 0); - mpz_set_ui (hgcd->u[1], 0); mpz_set_ui (hgcd->v[1], 1); - mpz_set_ui (hgcd->u[2], 1); mpz_set (hgcd->v[2], q); + mpz_set_ui (hgcd->m[0][0], 1); mpz_set_ui (hgcd->m[0][1], 0); + mpz_set_ui (hgcd->m[1][0], 1); mpz_set_ui (hgcd->m[1][1], 1); + } + else if (res > 0) + { + mpz_sub (a, a, b); + if (mpz_size (a) <= s) + return 0; - hgcd->sign = 0; + mpz_set_ui (hgcd->m[0][0], 1); mpz_set_ui (hgcd->m[0][1], 1); + mpz_set_ui (hgcd->m[1][0], 0); mpz_set_ui (hgcd->m[1][1], 1); + } + else + return 0; - mpz_init (t); + mpz_init (q); for (;;) { - mpz_fdiv_qr(q, hgcd->r[3], hgcd->r[1], hgcd->r[2]); + ASSERT (mpz_size (a) > s); + ASSERT (mpz_size (b) > s); - mpz_mul (hgcd->u[3], q, hgcd->u[2]); - mpz_add (hgcd->u[3], hgcd->u[3], hgcd->u[1]); - - mpz_mul (hgcd->v[3], 
q, hgcd->v[2]); - mpz_add (hgcd->v[3], hgcd->v[3], hgcd->v[1]); - - if (mpz_size (hgcd->r[3]) <= M) + if (mpz_cmp (a, b) > 0) { -#if 0 - unsigned i; - printf("hgcd_ref: sign = %d\n", hgcd->sign); - for (i = 0; i < 4; i++) - gmp_printf("r = %Zd, u = %Zd, v = %Zd\n", - hgcd->r[i], hgcd->u[i], hgcd->v[i]); -#endif - /* Check Jebelean's criterion */ - - if (hgcd->sign >= 0) - { - /* Check if r1 - r2 >= u2 - u1 */ - mpz_add (t, hgcd->u[2], hgcd->u[1]); - } - else - { - /* Check if r1 - r2 >= v2 - v1 */ - mpz_add (t, hgcd->v[2], hgcd->v[1]); - } - - /* Check r1 >= t + r2 */ - mpz_add (t, t, hgcd->r[2]); - if (mpz_cmp (hgcd->r[1], t) < 0) - { - res = 2; break; - } - - /* Now r2 is correct */ - if (hgcd->sign >= 0) - { - /* Check r3 >= max (-u3, -v3) = u3 */ - if (mpz_cmp (hgcd->r[3], hgcd->u[3]) < 0) - { - res = 3; break; - } - - /* Check r3 - r2 >= v3 - v2 */ - mpz_add (t, hgcd->v[3], hgcd->v[2]); - } - else - { - /* Check r3 >= max (-u3, -v3) = v3 */ - if (mpz_cmp (hgcd->r[3], hgcd->v[3]) < 0) - { - res = 3; break; - } - - /* Check r3 - r2 >= u3 - u2 */ - mpz_add (t, hgcd->u[3], hgcd->u[2]); - } - - /* Check r2 >= t + r3 */ - mpz_add (t, t, hgcd->r[3]); - if (mpz_cmp (hgcd->r[2], t) < 0) - { - res = 3; break; - } - - /* Now r3 is correct */ - res = 4; break; + if (!sdiv_qr (q, a, s, a, b)) + break; + mpz_addmul (hgcd->m[0][1], q, hgcd->m[0][0]); + mpz_addmul (hgcd->m[1][1], q, hgcd->m[1][0]); + } + else + { + if (!sdiv_qr (q, b, s, b, a)) + break; + mpz_addmul (hgcd->m[0][0], q, hgcd->m[0][1]); + mpz_addmul (hgcd->m[1][0], q, hgcd->m[1][1]); } - - /* Shift rows */ - hgcd->sign = ~hgcd->sign; - mpz_swap (hgcd->r[0], hgcd->r[1]); - mpz_swap (hgcd->r[1], hgcd->r[2]); - mpz_swap (hgcd->r[2], hgcd->r[3]); - - mpz_swap (hgcd->u[0], hgcd->u[1]); - mpz_swap (hgcd->u[1], hgcd->u[2]); - mpz_swap (hgcd->u[2], hgcd->u[3]); - - mpz_swap (hgcd->v[0], hgcd->v[1]); - mpz_swap (hgcd->v[1], hgcd->v[2]); - mpz_swap (hgcd->v[2], hgcd->v[3]); } - mpz_clear (t); mpz_clear (q); - return res; 
+ asize = mpz_size (a); + bsize = mpz_size (b); + return MAX (asize, bsize); } static int @@ -416,25 +381,22 @@ mpz_mpn_equal (const mpz_t a, mp_srcptr bp, mp_size_t bsize) mp_size_t asize = a->_mp_size; MPN_NORMALIZE (bp, bsize); - return asize == bsize && mpn_cmp(ap, bp, asize) == 0; + return asize == bsize && mpn_cmp (ap, bp, asize) == 0; } static int -hgcd_ref_equal (const struct hgcd *hgcd, const struct hgcd_ref *ref) +hgcd_ref_equal (const struct hgcd_matrix *hgcd, const struct hgcd_ref *ref) { unsigned i; - if (ref->sign != hgcd->sign) - return 0; - - for (i = 0; i<4; i++) + for (i = 0; i<2; i++) { - if (!mpz_mpn_equal (ref->r[i], hgcd->row[i].rp, hgcd->row[i].rsize)) - return 0; - if (!mpz_mpn_equal (ref->u[i], hgcd->row[i].uvp[0], hgcd->size)) - return 0; - if (!mpz_mpn_equal (ref->v[i], hgcd->row[i].uvp[1], hgcd->size)) - return 0; + unsigned j; + + for (j = 0; j<2; j++) + if (!mpz_mpn_equal (ref->m[i][j], hgcd->p[i][j], hgcd->n)) + return 0; } + return 1; } diff --git a/tests/mpn/t-matrix22.c b/tests/mpn/t-matrix22.c new file mode 100644 index 000000000..17d1dc614 --- /dev/null +++ b/tests/mpn/t-matrix22.c @@ -0,0 +1,207 @@ +/* Tests matrix22_mul. + +Copyright 2008 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ + +#include <stdio.h> +#include <stdlib.h> + +#include "gmp.h" +#include "gmp-impl.h" +#include "tests.h" + +struct matrix { + mp_size_t alloc; + mp_size_t n; + mp_ptr e00, e01, e10, e11; +}; + +static void +matrix_init (struct matrix *M, mp_size_t n) +{ + mp_ptr p = refmpn_malloc_limbs (4*(n+1)); + M->e00 = p; p += n+1; + M->e01 = p; p += n+1; + M->e10 = p; p += n+1; + M->e11 = p; + M->alloc = n + 1; + M->n = 0; +} + +static void +matrix_clear (struct matrix *M) +{ + refmpn_free_limbs (M->e00); +} + +static void +matrix_copy (struct matrix *R, const struct matrix *M) +{ + R->n = M->n; + MPN_COPY (R->e00, M->e00, M->n); + MPN_COPY (R->e01, M->e01, M->n); + MPN_COPY (R->e10, M->e10, M->n); + MPN_COPY (R->e11, M->e11, M->n); +} + +/* Used with same size, so no need for normalization. */ +static int +matrix_equal_p (const struct matrix *A, const struct matrix *B) +{ + return (A->n == B->n + && mpn_cmp (A->e00, B->e00, A->n) == 0 + && mpn_cmp (A->e01, B->e01, A->n) == 0 + && mpn_cmp (A->e10, B->e10, A->n) == 0 + && mpn_cmp (A->e11, B->e11, A->n) == 0); +} + +static void +matrix_random(struct matrix *M, mp_size_t n, gmp_randstate_ptr rands) +{ + M->n = n; + mpn_random (M->e00, n); + mpn_random (M->e01, n); + mpn_random (M->e10, n); + mpn_random (M->e11, n); +} + +#define MUL(rp, ap, an, bp, bn) do { \ + if (an > bn) \ + mpn_mul (rp, ap, an, bp, bn); \ + else \ + mpn_mul (rp, bp, bn, ap, an); \ + } while(0) + +static void +ref_matrix22_mul (struct matrix *R, + const struct matrix *A, + const struct matrix *B, mp_ptr tp) +{ + mp_size_t an, bn, n; + mp_ptr r00, r01, r10, r11, a00, a01, a10, a11, b00, b01, b10, b11; + + if (A->n >= B->n) + { + r00 = R->e00; a00 = A->e00; b00 = B->e00; + r01 = R->e01; a01 = A->e01; b01 = B->e01; + r10 = R->e10; a10 = A->e10; b10 = B->e10; + r11 = R->e11; a11 = A->e11; b11 = B->e11; + an = A->n, bn = B->n; + } + else + { + /* Transpose */ + r00 = R->e00; a00 = B->e00; b00 = A->e00; + r01 = R->e10; a01 = B->e10; b01 = A->e10; + r10 = 
R->e01; a10 = B->e01; b10 = A->e01; + r11 = R->e11; a11 = B->e11; b11 = A->e11; + an = B->n, bn = A->n; + } + n = an + bn; + R->n = n + 1; + + mpn_mul (r00, a00, an, b00, bn); + mpn_mul (tp, a01, an, b10, bn); + r00[n] = mpn_add_n (r00, r00, tp, n); + + mpn_mul (r01, a00, an, b01, bn); + mpn_mul (tp, a01, an, b11, bn); + r01[n] = mpn_add_n (r01, r01, tp, n); + + mpn_mul (r10, a10, an, b00, bn); + mpn_mul (tp, a11, an, b10, bn); + r10[n] = mpn_add_n (r10, r10, tp, n); + + mpn_mul (r11, a10, an, b01, bn); + mpn_mul (tp, a11, an, b11, bn); + r11[n] = mpn_add_n (r11, r11, tp, n); +} + +static void +one_test (const struct matrix *A, const struct matrix *B, int i) +{ + struct matrix R; + struct matrix P; + mp_ptr tp; + + matrix_init (&R, A->n + B->n + 1); + matrix_init (&P, A->n + B->n + 1); + + tp = refmpn_malloc_limbs (mpn_matrix22_mul_itch (A->n, B->n)); + + ref_matrix22_mul (&R, A, B, tp); + matrix_copy (&P, A); + mpn_matrix22_mul (P.e00, P.e01, P.e10, P.e11, A->n, + B->e00, B->e01, B->e10, B->e11, B->n, tp); + P.n = A->n + B->n + 1; + if (!matrix_equal_p (&R, &P)) + { + fprintf (stderr, "ERROR in test %d\n", i); + gmp_fprintf (stderr, "A = (%Nx, %Nx\n %Nx, %Nx)\n" + "B = (%Nx, %Nx\n %Nx, %Nx)\n" + "R = (%Nx, %Nx (expected)\n %Nx, %Nx)\n" + "P = (%Nx, %Nx (incorrect)\n %Nx, %Nx)\n", + A->e00, A->n, A->e01, A->n, A->e10, A->n, A->e11, A->n, + B->e00, B->n, B->e01, B->n, B->e10, B->n, B->e11, B->n, + R.e00, R.n, R.e01, R.n, R.e10, R.n, R.e11, R.n, + P.e00, P.n, P.e01, P.n, P.e10, P.n, P.e11, P.n); + abort(); + } + refmpn_free_limbs (tp); + matrix_clear (&R); + matrix_clear (&P); +} + +#define MAX_SIZE (2+2*MATRIX22_STRASSEN_THRESHOLD) + +int +main (int argc, char **argv) +{ + struct matrix A; + struct matrix B; + + gmp_randstate_ptr rands; + mpz_t bs; + int i; + + tests_start (); + rands = RANDS; + + matrix_init (&A, MAX_SIZE); + matrix_init (&B, MAX_SIZE); + mpz_init (bs); + + for (i = 0; i < 17; i++) + { + mp_size_t an, bn; + mpz_urandomb (bs, rands, 32); + an = 1 + 
mpz_get_ui (bs) % MAX_SIZE; + mpz_urandomb (bs, rands, 32); + bn = 1 + mpz_get_ui (bs) % MAX_SIZE; + + matrix_random (&A, an, rands); + matrix_random (&B, bn, rands); + + one_test (&A, &B, i); + } + mpz_clear (bs); + matrix_clear (&A); + matrix_clear (&B); + + return 0; +} diff --git a/tests/mpz/t-gcd.c b/tests/mpz/t-gcd.c index 13065bdab..a58832861 100644 --- a/tests/mpz/t-gcd.c +++ b/tests/mpz/t-gcd.c @@ -82,10 +82,10 @@ check_data (void) to reinitialize them for each test. */ mpz_t gcd1, gcd2, s, t, temp1, temp2; -#if GCD_SCHOENHAGE_THRESHOLD > GCDEXT_SCHOENHAGE_THRESHOLD -#define MAX_SCHOENHAGE_THRESHOLD GCD_SCHOENHAGE_THRESHOLD +#if GCD_DC_THRESHOLD > GCDEXT_DC_THRESHOLD +#define MAX_SCHOENHAGE_THRESHOLD GCD_DC_THRESHOLD #else -#define MAX_SCHOENHAGE_THRESHOLD GCDEXT_SCHOENHAGE_THRESHOLD +#define MAX_SCHOENHAGE_THRESHOLD GCDEXT_DC_THRESHOLD #endif /* Define this to make all operands be large enough for Schoenhage gcd @@ -252,6 +252,7 @@ one_test (mpz_t op1, mpz_t op2, mpz_t ref, int i) fprintf (stderr, "op1="); debug_mp (op1, -16); fprintf (stderr, "op2="); debug_mp (op2, -16); fprintf (stderr, "mpz_gcdext returns:\n");debug_mp (gcd1, -16); + fprintf (stderr, "s="); debug_mp (s, -16); abort (); } diff --git a/tune/Makefile.am b/tune/Makefile.am index 8748cbc4d..96d90ae77 100644 --- a/tune/Makefile.am +++ b/tune/Makefile.am @@ -41,7 +41,7 @@ EXTRA_LTLIBRARIES = libspeed.la libspeed_la_SOURCES = \ common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \ - freq.c gcd_bin.c gcd_accel.c gcd_finda_gen.c \ + freq.c \ gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \ jacbase1.c jacbase2.c jacbase3.c \ mod_1_div.c mod_1_inv.c modlinv.c \ @@ -124,7 +124,7 @@ DISTCLEANFILES = sqr_basecase.c $(MANY_DISTCLEAN) TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c TUNE_MPN_SRCS_BASIC = dc_divrem_n.c divrem_2.c gcd.c gcdext.c get_str.c \ - set_str.c hgcd.c mul_n.c mullow_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c + set_str.c matrix22_mul.c hgcd.c mul_n.c 
mullow_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c $(TUNE_MPN_SRCS_BASIC): for i in $(TUNE_MPN_SRCS_BASIC); do \ diff --git a/tune/common.c b/tune/common.c index c7b9b4e61..9efd4f85a 100644 --- a/tune/common.c +++ b/tune/common.c @@ -999,18 +999,71 @@ speed_mpn_mullow_basecase (struct speed_params *s) } double +speed_mpn_matrix22_mul (struct speed_params *s) +{ + /* Speed params only includes 2 inputs, so we have to invent the + other 6. */ + + mp_ptr a1, a2, a3; + mp_ptr r0, r1, r2, r3; + mp_ptr b1, b2, b3; + mp_ptr tp; + mp_size_t scratch; + unsigned i; + double t; + TMP_DECL; + + TMP_MARK; + SPEED_TMP_ALLOC_LIMBS (a1, s->size, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (a2, s->size, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (a3, s->size, s->align_xp); + + SPEED_TMP_ALLOC_LIMBS (b1, s->size, s->align_yp); + SPEED_TMP_ALLOC_LIMBS (b2, s->size, s->align_yp); + SPEED_TMP_ALLOC_LIMBS (b3, s->size, s->align_yp); + + SPEED_TMP_ALLOC_LIMBS (r0, 2 * s->size +1, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (r1, 2 * s->size +1, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (r2, 2 * s->size +1, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (r3, 2 * s->size +1, s->align_xp); + + mpn_random (a1, s->size); + mpn_random (a2, s->size); + mpn_random (a3, s->size); + mpn_random (b1, s->size); + mpn_random (b2, s->size); + mpn_random (b3, s->size); + + scratch = mpn_matrix22_mul_itch (s->size, s->size); + SPEED_TMP_ALLOC_LIMBS (tp, scratch, s->align_wp); + + speed_starttime (); + i = s->reps; + do + { + MPN_COPY (r0, s->xp, s->size); + MPN_COPY (r1, a1, s->size); + MPN_COPY (r2, a2, s->size); + MPN_COPY (r3, a3, s->size); + mpn_matrix22_mul (r0, r1, r2, r3, s->size, s->yp, b1, b2, b3, s->size, tp); + } + while (--i != 0); + t = speed_endtime(); + TMP_FREE; + return t; +} + +double speed_mpn_hgcd (struct speed_params *s) { mp_ptr wp; - mp_size_t hgcd_init_scratch = mpn_hgcd_init_itch (s->size); - mp_size_t qstack_scratch = qstack_itch (s->size); + mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); 
mp_size_t hgcd_scratch = mpn_hgcd_itch (s->size); mp_ptr ap; mp_ptr bp; mp_ptr tmp1, tmp2; - struct hgcd hgcd; - struct qstack quotients; + struct hgcd_matrix hgcd; int res; unsigned i; double t; @@ -1024,53 +1077,38 @@ speed_mpn_hgcd (struct speed_params *s) SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); - MPN_COPY (ap, s->xp, s->size); - MPN_COPY (bp, s->yp, s->size); - ap[s->size - 1] |= 1; - bp[s->size - 1] |= 1; - - /* We must have a >= b */ - if (mpn_cmp (ap, bp, s->size) < 0) - MP_PTR_SWAP (ap, bp); + s->xp[s->size - 1] |= 1; + s->yp[s->size - 1] |= 1; SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp); - mpn_hgcd_init (&hgcd, s->size, tmp1); - SPEED_TMP_ALLOC_LIMBS (tmp2, qstack_scratch, s->align_wp); - qstack_init ("ients, s->size, tmp2, qstack_scratch); + mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp); speed_starttime (); i = s->reps; do { - qstack_reset ("ients, s->size); - res = mpn_hgcd (&hgcd, ap, s->size, bp, s->size, - "ients, - wp, hgcd_scratch); + MPN_COPY (ap, s->xp, s->size); + MPN_COPY (bp, s->yp, s->size); + res = mpn_hgcd (ap, bp, s->size, &hgcd, wp); } while (--i != 0); t = speed_endtime (); -#if WANT_ASSERT - if (res) - ASSERT_HGCD (&hgcd, ap, s->size, bp, s->size, 0, 4); -#endif TMP_FREE; return t; } -#if 0 + double speed_mpn_hgcd_lehmer (struct speed_params *s) { mp_ptr wp; - mp_size_t hgcd_init_scratch = mpn_hgcd_init_itch (s->size); - mp_size_t qstack_scratch = qstack_itch (s->size); - mp_size_t hgcd_scratch = mpn_hgcd_itch (s->size); + mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); + mp_size_t hgcd_scratch = MPN_HGCD_LEHMER_ITCH (s->size); mp_ptr ap; mp_ptr bp; mp_ptr tmp1, tmp2; - struct hgcd hgcd; - struct qstack quotients; + struct hgcd_matrix hgcd; int res; unsigned i; double t; @@ -1084,45 +1122,33 @@ speed_mpn_hgcd_lehmer (struct speed_params *s) SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, 
s->align_xp); SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); - MPN_COPY (ap, s->xp, s->size); - MPN_COPY (bp, s->yp, s->size); - ap[s->size - 1] |= 1; - bp[s->size - 1] |= 1; - - /* We must have a >= b */ - if (mpn_cmp (ap, bp, s->size) < 0) - MP_PTR_SWAP (ap, bp); + s->xp[s->size - 1] |= 1; + s->yp[s->size - 1] |= 1; SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp); - mpn_hgcd_init (&hgcd, s->size, tmp1); - SPEED_TMP_ALLOC_LIMBS (tmp2, qstack_scratch, s->align_wp); - qstack_init ("ients, s->size, tmp2, qstack_scratch); + mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp); speed_starttime (); i = s->reps; do { - qstack_reset ("ients, s->size); - res = mpn_hgcd_lehmer (&hgcd, ap, s->size, bp, s->size, - "ients, - wp, hgcd_scratch); + MPN_COPY (ap, s->xp, s->size); + MPN_COPY (bp, s->yp, s->size); + res = mpn_hgcd_lehmer (ap, bp, s->size, &hgcd, wp); } while (--i != 0); t = speed_endtime (); -#if WANT_ASSERT - if (res) - ASSERT_HGCD (&hgcd, ap, s->size, bp, s->size, 0, 4); -#endif TMP_FREE; return t; } -#endif + double speed_mpn_gcd (struct speed_params *s) { SPEED_ROUTINE_MPN_GCD (mpn_gcd); } +#if 0 double speed_mpn_gcd_binary (struct speed_params *s) { @@ -1133,7 +1159,7 @@ speed_mpn_gcd_accel (struct speed_params *s) { SPEED_ROUTINE_MPN_GCD (mpn_gcd_accel); } - +#endif #if HAVE_NATIVE_mpn_gcd_finda double speed_mpn_gcd_finda (struct speed_params *s) diff --git a/tune/speed.c b/tune/speed.c index 90e3990de..abe9e70b8 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -255,17 +255,20 @@ const struct routine_t { { "mpn_popcount", speed_mpn_popcount }, { "mpn_hamdist", speed_mpn_hamdist }, + { "mpn_matrix22_mul", speed_mpn_matrix22_mul }, + { "mpn_hgcd", speed_mpn_hgcd }, -#if 0 { "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer }, -#endif + { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL }, { "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL }, { "mpn_gcd", speed_mpn_gcd }, +#if 0 { "mpn_gcd_binary", 
speed_mpn_gcd_binary }, { "mpn_gcd_accel", speed_mpn_gcd_accel }, { "find_a", speed_find_a, FLAG_NODATA }, +#endif #if HAVE_NATIVE_mpn_gcd_finda { "mpn_gcd_finda", speed_mpn_gcd_finda, FLAG_NODATA }, #endif diff --git a/tune/speed.h b/tune/speed.h index c2055ca4a..ff8a8f73c 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -182,6 +182,7 @@ double speed_mpn_divrem_2 _PROTO ((struct speed_params *s)); double speed_mpn_divrem_2_div _PROTO ((struct speed_params *s)); double speed_mpn_divrem_2_inv _PROTO ((struct speed_params *s)); double speed_mpn_fib2_ui _PROTO ((struct speed_params *s)); +double speed_mpn_matrix22_mul _PROTO ((struct speed_params *s)); double speed_mpn_hgcd _PROTO ((struct speed_params *s)); double speed_mpn_hgcd_lehmer _PROTO ((struct speed_params *s)); double speed_mpn_gcd _PROTO ((struct speed_params *s)); diff --git a/tune/tuneup.c b/tune/tuneup.c index fa6778dba..6d1acf9e0 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -162,10 +162,11 @@ mp_size_t mullow_mul_n_threshold = MP_SIZE_T_MAX; mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX; mp_size_t div_dc_threshold = MP_SIZE_T_MAX; mp_size_t powm_threshold = MP_SIZE_T_MAX; -mp_size_t hgcd_schoenhage_threshold = MP_SIZE_T_MAX; +mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; +mp_size_t hgcd_threshold = MP_SIZE_T_MAX; mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX; -mp_size_t gcd_schoenhage_threshold = MP_SIZE_T_MAX; -mp_size_t gcdext_schoenhage_threshold = MP_SIZE_T_MAX; +mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; +mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; @@ -1007,17 +1008,27 @@ tune_powm (void) void +tune_matrix22_mul (void) +{ + static struct param_t param; + param.name = "MATRIX22_STRASSEN_THRESHOLD"; + param.function = speed_mpn_matrix22_mul; + param.min_size = 2; + one (&matrix22_strassen_threshold, ¶m); +} + +void tune_hgcd 
(void) { static struct param_t param; - param.name = "HGCD_SCHOENHAGE_THRESHOLD"; + param.name = "HGCD_THRESHOLD"; param.function = speed_mpn_hgcd; /* We seem to get strange results for small sizes */ - param.min_size = 50; - param.step_factor = 0.05; - one (&hgcd_schoenhage_threshold, ¶m); + param.min_size = 30; + one (&hgcd_threshold, ¶m); } +#if 0 void tune_gcd_accel (void) { @@ -1027,29 +1038,29 @@ tune_gcd_accel (void) param.min_size = 1; one (&gcd_accel_threshold, ¶m); } - +#endif void -tune_gcd_schoenhage (void) +tune_gcd_dc (void) { static struct param_t param; - param.name = "GCD_SCHOENHAGE_THRESHOLD"; + param.name = "GCD_DC_THRESHOLD"; param.function = speed_mpn_gcd; - param.min_size = hgcd_schoenhage_threshold; + param.min_size = hgcd_threshold; param.max_size = 3000; param.step_factor = 0.1; - one (&gcd_schoenhage_threshold, ¶m); + one (&gcd_dc_threshold, ¶m); } void -tune_gcdext_schoenhage (void) +tune_gcdext_dc (void) { static struct param_t param; - param.name = "GCDEXT_SCHOENHAGE_THRESHOLD"; + param.name = "GCDEXT_DC_THRESHOLD"; param.function = speed_mpn_gcdext; - param.min_size = hgcd_schoenhage_threshold; + param.min_size = hgcd_threshold; param.max_size = 3000; param.step_factor = 0.1; - one (&gcdext_schoenhage_threshold, ¶m); + one (&gcdext_dc_threshold, ¶m); } @@ -1771,10 +1782,13 @@ all (void) tune_powm (); printf("\n"); + tune_matrix22_mul (); tune_hgcd (); + tune_gcd_dc (); + tune_gcdext_dc (); +#if 0 tune_gcd_accel (); - tune_gcd_schoenhage (); - tune_gcdext_schoenhage (); +#endif tune_jacobi_base (); printf("\n"); |