summary | refs | log | tree | commit | diff
path: root/mpn
diff options
context:
space:
mode:
Diffstat (limited to 'mpn')
-rw-r--r-- mpn/alpha/ev5/gmp-mparam.h 40
-rw-r--r-- mpn/alpha/ev6/gmp-mparam.h 76
-rw-r--r-- mpn/asm-defs.m4 17
-rw-r--r-- mpn/generic/gcd_subdiv_step.c 2
-rw-r--r-- mpn/generic/hgcd_appr.c 181
-rw-r--r-- mpn/generic/hgcd_jacobi.c 4
-rw-r--r-- mpn/generic/hgcd_reduce.c 14
-rw-r--r-- mpn/generic/hgcd_step.c 2
-rw-r--r-- mpn/generic/powm.c 44
-rw-r--r-- mpn/generic/powm_sec.c 126
-rw-r--r-- mpn/generic/redc_1.c 5
-rw-r--r-- mpn/generic/tabselect.c (renamed from mpn/generic/redc_1_sec.c) 35
-rw-r--r-- mpn/generic/udiv_w_sdiv.c 6
-rw-r--r-- mpn/ia64/gmp-mparam.h 148
-rw-r--r-- mpn/ia64/tabselect.asm 139
-rw-r--r-- mpn/pa64/gmp-mparam.h 62
-rw-r--r-- mpn/powerpc32/aors_n.asm 19
-rw-r--r-- mpn/powerpc32/p3-p7/aors_n.asm 176
-rw-r--r-- mpn/powerpc32/p5/gmp-mparam.h 137
-rw-r--r-- mpn/powerpc32/p6/gmp-mparam.h 206
-rw-r--r-- mpn/powerpc32/p7/gmp-mparam.h 149
-rw-r--r-- mpn/powerpc32/tabselect.asm 98
-rw-r--r-- mpn/powerpc64/com.asm 9
-rw-r--r-- mpn/powerpc64/copyd.asm 9
-rw-r--r-- mpn/powerpc64/copyi.asm 9
-rw-r--r-- mpn/powerpc64/logops_n.asm 9
-rw-r--r-- mpn/powerpc64/lshift.asm 11
-rw-r--r-- mpn/powerpc64/lshiftc.asm (renamed from mpn/powerpc64/mode64/lshiftc.asm) 16
-rw-r--r-- mpn/powerpc64/mode64/aors_n.asm 14
-rw-r--r-- mpn/powerpc64/mode64/aorscnd_n.asm 185
-rw-r--r-- mpn/powerpc64/mode64/aorslshC_n.asm 11
-rw-r--r-- mpn/powerpc64/mode64/aorsmul_1.asm 15
-rw-r--r-- mpn/powerpc64/mode64/bdiv_dbm1c.asm 4
-rw-r--r-- mpn/powerpc64/mode64/dive_1.asm 11
-rw-r--r-- mpn/powerpc64/mode64/divrem_1.asm 13
-rw-r--r-- mpn/powerpc64/mode64/divrem_2.asm 11
-rw-r--r-- mpn/powerpc64/mode64/invert_limb.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mod_1_1.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mod_1_4.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mod_34lsub1.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mode1o.asm 10
-rw-r--r-- mpn/powerpc64/mode64/mul_1.asm 11
-rw-r--r-- mpn/powerpc64/mode64/mul_basecase.asm 12
-rw-r--r-- mpn/powerpc64/mode64/p3/gmp-mparam.h 73
-rw-r--r-- mpn/powerpc64/mode64/p4/gmp-mparam.h 31
-rw-r--r-- mpn/powerpc64/mode64/p5/gmp-mparam.h 41
-rw-r--r-- mpn/powerpc64/mode64/p6/aorsmul_1.asm 172
-rw-r--r-- mpn/powerpc64/mode64/p6/gmp-mparam.h 85
-rw-r--r-- mpn/powerpc64/mode64/p6/mul_basecase.asm 2
-rw-r--r-- mpn/powerpc64/mode64/p7/gmp-mparam.h 159
-rw-r--r-- mpn/powerpc64/mode64/rsh1add_n.asm 11
-rw-r--r-- mpn/powerpc64/mode64/rsh1sub_n.asm 11
-rw-r--r-- mpn/powerpc64/mode64/sqr_basecase.asm 852
-rw-r--r-- mpn/powerpc64/mode64/sqr_diag_addlsh1.asm 238
-rw-r--r-- mpn/powerpc64/rshift.asm 11
-rw-r--r-- mpn/powerpc64/tabselect.asm 96
-rw-r--r-- mpn/s390_32/esame/gmp-mparam.h 86
-rw-r--r-- mpn/s390_32/lshift.asm 2
-rw-r--r-- mpn/s390_32/lshiftc.asm 2
-rw-r--r-- mpn/s390_32/rshift.asm 2
-rw-r--r-- mpn/s390_64/README 77
-rw-r--r-- mpn/s390_64/gmp-mparam.h 24
-rw-r--r-- mpn/sparc64/ultrasparc34/gmp-mparam.h 29
-rw-r--r-- mpn/sparc64/ultrasparct1/gmp-mparam.h 36
-rw-r--r-- mpn/x86/atom/gmp-mparam.h 41
-rw-r--r-- mpn/x86/atom/lshift.asm 4
-rw-r--r-- mpn/x86/atom/sse2/mul_1.asm 2
-rw-r--r-- mpn/x86/bdiv_dbm1c.asm 4
-rw-r--r-- mpn/x86/bdiv_q_1.asm 2
-rw-r--r-- mpn/x86/bobcat/gmp-mparam.h 142
-rw-r--r-- mpn/x86/core2/gmp-mparam.h 141
-rw-r--r-- mpn/x86/coreinhm/gmp-mparam.h 141
-rw-r--r-- mpn/x86/coreisbr/gmp-mparam.h 140
-rw-r--r-- mpn/x86/k10/gmp-mparam.h 142
-rw-r--r-- mpn/x86/k7/addlsh1_n.asm 6
-rw-r--r-- mpn/x86/k7/gmp-mparam.h 45
-rw-r--r-- mpn/x86/k7/invert_limb.asm 2
-rw-r--r-- mpn/x86/k7/sublsh1_n.asm 8
-rw-r--r-- mpn/x86/k8/gmp-mparam.h 144
-rw-r--r-- mpn/x86/nano/gmp-mparam.h 152
-rw-r--r-- mpn/x86/p6/bdiv_q_1.asm 4
-rw-r--r-- mpn/x86/p6/sse2/gmp-mparam.h 61
-rw-r--r-- mpn/x86/pentium/bdiv_q_1.asm 2
-rw-r--r-- mpn/x86/pentium4/sse2/gmp-mparam.h 85
-rw-r--r-- mpn/x86/tabselect.asm 104
-rw-r--r-- mpn/x86_64/addmul_2.asm 7
-rw-r--r-- mpn/x86_64/aorrlsh1_n.asm 8
-rw-r--r-- mpn/x86_64/aorrlsh2_n.asm 5
-rw-r--r-- mpn/x86_64/aorrlshC_n.asm 7
-rw-r--r-- mpn/x86_64/aorrlsh_n.asm 14
-rw-r--r-- mpn/x86_64/aors_n.asm 34
-rw-r--r-- mpn/x86_64/aorscnd_n.asm 178
-rw-r--r-- mpn/x86_64/aorsmul_1.asm 51
-rw-r--r-- mpn/x86_64/atom/gmp-mparam.h 17
-rw-r--r-- mpn/x86_64/bdiv_dbm1c.asm 16
-rw-r--r-- mpn/x86_64/bdiv_q_1.asm 21
-rw-r--r-- mpn/x86_64/bobcat/gmp-mparam.h 10
-rw-r--r-- mpn/x86_64/com.asm 8
-rw-r--r-- mpn/x86_64/copyd.asm 9
-rw-r--r-- mpn/x86_64/copyi.asm 9
-rw-r--r-- mpn/x86_64/core2/aorrlsh1_n.asm 5
-rw-r--r-- mpn/x86_64/core2/aorrlsh2_n.asm 5
-rw-r--r-- mpn/x86_64/core2/aorrlsh_n.asm 4
-rw-r--r-- mpn/x86_64/core2/aors_n.asm 19
-rw-r--r-- mpn/x86_64/core2/aorsmul_1.asm 8
-rw-r--r-- mpn/x86_64/core2/gmp-mparam.h 23
-rw-r--r-- mpn/x86_64/core2/lshift.asm 39
-rw-r--r-- mpn/x86_64/core2/lshiftc.asm 39
-rw-r--r-- mpn/x86_64/core2/rsh1aors_n.asm 17
-rw-r--r-- mpn/x86_64/core2/rshift.asm 39
-rw-r--r-- mpn/x86_64/core2/sublsh1_n.asm 5
-rw-r--r-- mpn/x86_64/core2/sublsh2_n.asm 5
-rw-r--r-- mpn/x86_64/core2/sublshC_n.asm 4
-rw-r--r-- mpn/x86_64/coreinhm/aorrlsh_n.asm 17
-rw-r--r-- mpn/x86_64/coreinhm/gmp-mparam.h 113
-rw-r--r-- mpn/x86_64/coreisbr/aors_n.asm 14
-rw-r--r-- mpn/x86_64/coreisbr/gmp-mparam.h 166
-rw-r--r-- mpn/x86_64/div_qr_2n_pi1.asm 6
-rw-r--r-- mpn/x86_64/div_qr_2u_pi1.asm 6
-rw-r--r-- mpn/x86_64/dos64.m4 39
-rw-r--r-- mpn/x86_64/gmp-mparam.h 13
-rw-r--r-- mpn/x86_64/invert_limb.asm 6
-rw-r--r-- mpn/x86_64/invert_limb_table.asm 3
-rw-r--r-- mpn/x86_64/logops_n.asm 16
-rw-r--r-- mpn/x86_64/lshift.asm 11
-rw-r--r-- mpn/x86_64/lshiftc.asm 7
-rw-r--r-- mpn/x86_64/lshsub_n.asm 16
-rw-r--r-- mpn/x86_64/mod_1_1.asm 11
-rw-r--r-- mpn/x86_64/mod_1_2.asm 9
-rw-r--r-- mpn/x86_64/mod_1_4.asm 15
-rw-r--r-- mpn/x86_64/mod_34lsub1.asm 12
-rw-r--r-- mpn/x86_64/mul_1.asm 55
-rw-r--r-- mpn/x86_64/mul_2.asm 7
-rw-r--r-- mpn/x86_64/mul_basecase.asm 14
-rw-r--r-- mpn/x86_64/mulmid_basecase.asm 14
-rw-r--r-- mpn/x86_64/nano/gmp-mparam.h 33
-rw-r--r-- mpn/x86_64/pentium4/gmp-mparam.h 51
-rw-r--r-- mpn/x86_64/popham.asm 12
-rw-r--r-- mpn/x86_64/redc_1.asm 73
-rw-r--r-- mpn/x86_64/rsh1aors_n.asm 17
-rw-r--r-- mpn/x86_64/rshift.asm 7
-rw-r--r-- mpn/x86_64/sqr_basecase.asm 9
-rw-r--r-- mpn/x86_64/sublsh1_n.asm 7
-rw-r--r-- mpn/x86_64/tabselect.asm 123
-rw-r--r-- mpn/x86_64/x86_64-defs.m4 7
145 files changed, 5455 insertions, 1562 deletions
diff --git a/mpn/alpha/ev5/gmp-mparam.h b/mpn/alpha/ev5/gmp-mparam.h
index a4c794838..395353a46 100644
--- a/mpn/alpha/ev5/gmp-mparam.h
+++ b/mpn/alpha/ev5/gmp-mparam.h
@@ -26,38 +26,44 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 29
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 /* never mpn_mod_1_1p */
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 4
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 75
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIV_QR_2_PI2_THRESHOLD 21
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 80
+#define BMOD_1_TO_MOD_1_THRESHOLD 78
-#define MUL_TOOM22_THRESHOLD 18
-#define MUL_TOOM33_THRESHOLD 61
-#define MUL_TOOM44_THRESHOLD 88
+#define MUL_TOOM22_THRESHOLD 14
+#define MUL_TOOM33_THRESHOLD 57
+#define MUL_TOOM44_THRESHOLD 118
#define MUL_TOOM6H_THRESHOLD 173
-#define MUL_TOOM8H_THRESHOLD 0
+#define MUL_TOOM8H_THRESHOLD 240
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 60
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
#define SQR_BASECASE_THRESHOLD 4
#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 65
+#define SQR_TOOM3_THRESHOLD 77
#define SQR_TOOM4_THRESHOLD 136
-#define SQR_TOOM6_THRESHOLD 180
-#define SQR_TOOM8_THRESHOLD 248
+#define SQR_TOOM6_THRESHOLD 173
+#define SQR_TOOM8_THRESHOLD 260
+
+#define MULMID_TOOM42_THRESHOLD 20
#define MULMOD_BNM1_THRESHOLD 11
#define SQRMOD_BNM1_THRESHOLD 13
+#define POWM_SEC_TABLE 2,17,322,387
+
#define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 244, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
@@ -161,9 +167,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 942
#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 101
-#define GCD_DC_THRESHOLD 330
-#define GCDEXT_DC_THRESHOLD 222
+#define HGCD_THRESHOLD 105
+#define HGCD_APPR_THRESHOLD 111
+#define HGCD_REDUCE_THRESHOLD 1437
+#define GCD_DC_THRESHOLD 318
+#define GCDEXT_DC_THRESHOLD 214
#define JACOBI_BASE_METHOD 2
#define GET_STR_DC_THRESHOLD 16
diff --git a/mpn/alpha/ev6/gmp-mparam.h b/mpn/alpha/ev6/gmp-mparam.h
index 12c3891d7..ce865f4cc 100644
--- a/mpn/alpha/ev6/gmp-mparam.h
+++ b/mpn/alpha/ev6/gmp-mparam.h
@@ -29,38 +29,44 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 6
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 4
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIV_QR_2_PI2_THRESHOLD 8
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 16
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
#define MUL_TOOM22_THRESHOLD 35
-#define MUL_TOOM33_THRESHOLD 74
-#define MUL_TOOM44_THRESHOLD 178
-#define MUL_TOOM6H_THRESHOLD 288
-#define MUL_TOOM8H_THRESHOLD 333
+#define MUL_TOOM33_THRESHOLD 77
+#define MUL_TOOM44_THRESHOLD 184
+#define MUL_TOOM6H_THRESHOLD 228
+#define MUL_TOOM8H_THRESHOLD 288
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 75
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 101
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 73
-#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 61
-#define SQR_TOOM3_THRESHOLD 107
-#define SQR_TOOM4_THRESHOLD 170
-#define SQR_TOOM6_THRESHOLD 309
-#define SQR_TOOM8_THRESHOLD 360
+#define SQR_BASECASE_THRESHOLD 0 /* always */
+#define SQR_TOOM2_THRESHOLD 58
+#define SQR_TOOM3_THRESHOLD 103
+#define SQR_TOOM4_THRESHOLD 172
+#define SQR_TOOM6_THRESHOLD 264
+#define SQR_TOOM8_THRESHOLD 333
+
+#define MULMID_TOOM42_THRESHOLD 52
#define MULMOD_BNM1_THRESHOLD 20
#define SQRMOD_BNM1_THRESHOLD 23
+#define POWM_SEC_TABLE 4,17,246,2388
+
#define MUL_FFT_MODF_THRESHOLD 480 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 480, 5}, { 18, 6}, { 10, 5}, { 21, 6}, \
@@ -148,19 +154,19 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 3136
#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 130
-#define MULLO_MUL_N_THRESHOLD 15604
+#define MULLO_DC_THRESHOLD 173
+#define MULLO_MUL_N_THRESHOLD 11355
-#define DC_DIV_QR_THRESHOLD 119
-#define DC_DIVAPPR_Q_THRESHOLD 390
+#define DC_DIV_QR_THRESHOLD 112
+#define DC_DIVAPPR_Q_THRESHOLD 422
#define DC_BDIV_QR_THRESHOLD 110
-#define DC_BDIV_Q_THRESHOLD 318
+#define DC_BDIV_Q_THRESHOLD 348
-#define INV_MULMOD_BNM1_THRESHOLD 75
-#define INV_NEWTON_THRESHOLD 390
-#define INV_APPR_THRESHOLD 372
+#define INV_MULMOD_BNM1_THRESHOLD 68
+#define INV_NEWTON_THRESHOLD 402
+#define INV_APPR_THRESHOLD 396
-#define BINV_NEWTON_THRESHOLD 393
+#define BINV_NEWTON_THRESHOLD 399
#define REDC_1_TO_REDC_N_THRESHOLD 110
#define MU_DIV_QR_THRESHOLD 1718
@@ -170,12 +176,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 1652
#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 282
-#define GCD_DC_THRESHOLD 1138
-#define GCDEXT_DC_THRESHOLD 773
+#define HGCD_THRESHOLD 278
+#define HGCD_APPR_THRESHOLD 366
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 1258
+#define GCDEXT_DC_THRESHOLD 777
#define JACOBI_BASE_METHOD 3
-#define GET_STR_DC_THRESHOLD 14
-#define GET_STR_PRECOMPUTE_THRESHOLD 19
-#define SET_STR_DC_THRESHOLD 3754
-#define SET_STR_PRECOMPUTE_THRESHOLD 8097
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 3539
+#define SET_STR_PRECOMPUTE_THRESHOLD 7784
diff --git a/mpn/asm-defs.m4 b/mpn/asm-defs.m4
index 4f049b21b..b95cad7c0 100644
--- a/mpn/asm-defs.m4
+++ b/mpn/asm-defs.m4
@@ -1471,6 +1471,7 @@ define_mpn(sub_n)
define_mpn(sub_nc)
define_mpn(submul_1)
define_mpn(submul_1c)
+define_mpn(tabselect)
define_mpn(umul_ppmm)
define_mpn(umul_ppmm_r)
define_mpn(udiv_qrnnd)
@@ -1712,6 +1713,22 @@ m4_assert_numargs(1)
)
+dnl Usage: ABI_SUPPORT(abi)
+dnl
+dnl A dummy macro which is grepped for by ./configure to know what ABIs
+dnl are supported in an asm file.
+dnl
+dnl If multiple non-standard ABIs are supported, several ABI_SUPPORT
+dnl declarations should be used:
+dnl
+dnl ABI_SUPPORT(FOOABI)
+dnl ABI_SUPPORT(BARABI)
+
+define(ABI_SUPPORT,
+m4_assert_numargs(1)
+)
+
+
dnl Usage: GMP_NUMB_MASK
dnl
dnl A bit mask for the number part of a limb. Eg. with 6 bit nails in a
diff --git a/mpn/generic/gcd_subdiv_step.c b/mpn/generic/gcd_subdiv_step.c
index 11c00bb6a..3db34073c 100644
--- a/mpn/generic/gcd_subdiv_step.c
+++ b/mpn/generic/gcd_subdiv_step.c
@@ -185,7 +185,7 @@ mpn_gcd_subdiv_step (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t s,
}
else
MPN_COPY (bp, ap, an);
-
+
MPN_DECR_U (tp, qn, 1);
}
diff --git a/mpn/generic/hgcd_appr.c b/mpn/generic/hgcd_appr.c
index 963eaea47..f7c7eb2c9 100644
--- a/mpn/generic/hgcd_appr.c
+++ b/mpn/generic/hgcd_appr.c
@@ -25,172 +25,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
-/* Computes R -= A * B. Result must be non-negative. Normalized down
- to size an, and resulting size is returned. */
-static mp_size_t
-submul (mp_ptr rp, mp_size_t rn,
- mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
-{
- mp_ptr tp;
- TMP_DECL;
-
- ASSERT (bn > 0);
- ASSERT (an >= bn);
- ASSERT (rn >= an);
- ASSERT (an + bn <= rn + 1);
-
- TMP_MARK;
- tp = TMP_ALLOC_LIMBS (an + bn);
-
- mpn_mul (tp, ap, an, bp, bn);
- if (an + bn > rn)
- {
- ASSERT (tp[rn] == 0);
- bn--;
- }
- ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn));
- TMP_FREE;
-
- while (rn > an && (rp[rn-1] == 0))
- rn--;
-
- return rn;
-}
-
-/* Computes (a, b) <-- M^{-1} (a; b) */
-/* FIXME:
- x Take scratch parameter, and figure out scratch need.
-
- x Use some fallback for small M->n?
-*/
-static mp_size_t
-hgcd_matrix_apply (const struct hgcd_matrix *M,
- mp_ptr ap, mp_ptr bp,
- mp_size_t n)
-{
- mp_size_t an, bn, un, vn, nn;
- mp_size_t mn[2][2];
- mp_size_t modn;
- mp_ptr tp, sp, scratch;
- mp_limb_t cy;
- unsigned i, j;
-
- TMP_DECL;
-
- ASSERT ( (ap[n-1] | bp[n-1]) > 0);
-
- an = n;
- MPN_NORMALIZE (ap, an);
- bn = n;
- MPN_NORMALIZE (bp, bn);
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- {
- mp_size_t k;
- k = M->n;
- MPN_NORMALIZE (M->p[i][j], k);
- mn[i][j] = k;
- }
-
- ASSERT (mn[0][0] > 0);
- ASSERT (mn[1][1] > 0);
- ASSERT ( (mn[0][1] | mn[1][0]) > 0);
-
- TMP_MARK;
-
- if (mn[0][1] == 0)
- {
- mp_size_t qn;
-
- /* A unchanged, M = (1, 0; q, 1) */
- ASSERT (mn[0][0] == 1);
- ASSERT (M->p[0][0][0] == 1);
- ASSERT (mn[1][1] == 1);
- ASSERT (M->p[1][1][0] == 1);
-
- /* Put B <-- B - q A */
- nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
- }
- else if (mn[1][0] == 0)
- {
- /* B unchanged, M = (1, q; 0, 1) */
- ASSERT (mn[0][0] == 1);
- ASSERT (M->p[0][0][0] == 1);
- ASSERT (mn[1][1] == 1);
- ASSERT (M->p[1][1][0] == 1);
-
- /* Put A <-- A - q * B */
- nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
- }
- else
- {
- /* A = m00 a + m01 b ==> a <= A / m00, b <= A / m01.
- B = m10 a + m11 b ==> a <= B / m10, b <= B / m11. */
- un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
- vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;
-
- nn = MAX (un, vn);
- /* In the range of interest, mulmod_bnm1 should always beat mullo. */
- modn = mpn_mulmod_bnm1_next_size (nn + 1);
-
- scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
- tp = TMP_ALLOC_LIMBS (modn);
- sp = TMP_ALLOC_LIMBS (modn);
-
- ASSERT (n <= 2*modn);
-
- if (n > modn)
- {
- cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
- MPN_INCR_U (ap, modn, cy);
-
- cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
- MPN_INCR_U (bp, modn, cy);
-
- n = modn;
- }
-
- mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
- mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);
-
- /* FIXME: Handle the small n case in some better way. */
- if (n + mn[1][1] < modn)
- MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
- if (n + mn[0][1] < modn)
- MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);
-
- cy = mpn_sub_n (tp, tp, sp, modn);
- MPN_DECR_U (tp, modn, cy);
-
- ASSERT (mpn_zero_p (tp + nn, modn - nn));
-
- mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
- MPN_COPY (ap, tp, nn);
- mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);
-
- if (n + mn[1][0] < modn)
- MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
- if (n + mn[0][0] < modn)
- MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);
-
- cy = mpn_sub_n (tp, tp, sp, modn);
- MPN_DECR_U (tp, modn, cy);
-
- ASSERT (mpn_zero_p (tp + nn, modn - nn));
- MPN_COPY (bp, tp, nn);
-
- while ( (ap[nn-1] | bp[nn-1]) == 0)
- {
- nn--;
- ASSERT (nn > 0);
- }
- }
- TMP_FREE;
-
- return nn;
-}
-
/* Identical to mpn_hgcd_itch. FIXME: Do we really need to add
HGCD_THRESHOLD at the end? */
mp_size_t
@@ -238,7 +72,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
we discard some of the least significant limbs, we must keep one
additional bit to account for the truncation error. We maintain
the GMP_NUMB_BITS * s - extra_bits as the current target size. */
-
+
s = n/2 + 1;
if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
{
@@ -321,7 +155,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
ASSERT (n <= 2*s);
nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
-
+
if (!nn)
return 1;
@@ -347,13 +181,12 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
{
mp_size_t n2 = (3*n)/4 + 1;
mp_size_t p = n/2;
- mp_size_t input_n = n;
+ mp_size_t nn;
- MPN_COPY (tp, ap + p, n - p);
- MPN_COPY (tp + n - p, bp + p, n - p);
- if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p)))
+ nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp);
+ if (nn)
{
- n = hgcd_matrix_apply (M, ap, bp, n);
+ n = nn;
/* FIXME: Discard some of the low limbs immediately? */
success = 1;
}
@@ -416,7 +249,7 @@ mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
ASSERT (n <= 2*s);
nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
-
+
if (!nn)
return success;
diff --git a/mpn/generic/hgcd_jacobi.c b/mpn/generic/hgcd_jacobi.c
index 2dce43b99..0d4cb021c 100644
--- a/mpn/generic/hgcd_jacobi.c
+++ b/mpn/generic/hgcd_jacobi.c
@@ -26,7 +26,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "longlong.h"
/* This file is almost a copy of hgcd.c, with some added calls to
- mpn_jacobi_update */
+ mpn_jacobi_update */
struct hgcd_jacobi_ctx
{
@@ -127,7 +127,7 @@ hgcd_jacobi_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
struct hgcd_jacobi_ctx ctx;
ctx.M = M;
ctx.bitsp = bitsp;
-
+
return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_jacobi_hook, &ctx, tp);
}
}
diff --git a/mpn/generic/hgcd_reduce.c b/mpn/generic/hgcd_reduce.c
index 142d44a30..89240af4d 100644
--- a/mpn/generic/hgcd_reduce.c
+++ b/mpn/generic/hgcd_reduce.c
@@ -38,7 +38,7 @@ submul (mp_ptr rp, mp_size_t rn,
ASSERT (an >= bn);
ASSERT (rn >= an);
ASSERT (an + bn <= rn + 1);
-
+
TMP_MARK;
tp = TMP_ALLOC_LIMBS (an + bn);
@@ -61,7 +61,7 @@ submul (mp_ptr rp, mp_size_t rn,
/* FIXME:
x Take scratch parameter, and figure out scratch need.
- x Use some fallback for small M->n?
+ x Use some fallback for small M->n?
*/
static mp_size_t
hgcd_matrix_apply (const struct hgcd_matrix *M,
@@ -83,7 +83,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
MPN_NORMALIZE (ap, an);
bn = n;
MPN_NORMALIZE (bp, bn);
-
+
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
{
@@ -102,7 +102,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
if (mn[0][1] == 0)
{
mp_size_t qn;
-
+
/* A unchanged, M = (1, 0; q, 1) */
ASSERT (mn[0][0] == 1);
ASSERT (M->p[0][0][0] == 1);
@@ -121,7 +121,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
ASSERT (M->p[1][1][0] == 1);
/* Put A <-- A - q * B */
- nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
+ nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
}
else
{
@@ -159,7 +159,7 @@ hgcd_matrix_apply (const struct hgcd_matrix *M,
MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
if (n + mn[0][1] < modn)
MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);
-
+
cy = mpn_sub_n (tp, tp, sp, modn);
MPN_DECR_U (tp, modn, cy);
@@ -209,7 +209,7 @@ mpn_hgcd_reduce_itch (mp_size_t n, mp_size_t p)
itch = 2*(n-p) + mpn_hgcd_itch (n-p);
/* Currently, hgcd_matrix_apply allocates its own storage. */
}
- return itch;
+ return itch;
}
/* FIXME: Document storage need. */
diff --git a/mpn/generic/hgcd_step.c b/mpn/generic/hgcd_step.c
index 0e56be39e..dbc757935 100644
--- a/mpn/generic/hgcd_step.c
+++ b/mpn/generic/hgcd_step.c
@@ -112,7 +112,7 @@ mpn_hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
/* Multiply M1^{-1} (a;b) */
return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n);
}
-
+
subtract:
return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_hook, M, tp);
diff --git a/mpn/generic/powm.c b/mpn/generic/powm.c
index 57edfd4f6..fa92362ad 100644
--- a/mpn/generic/powm.c
+++ b/mpn/generic/powm.c
@@ -6,7 +6,7 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -74,6 +74,16 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
+#undef MPN_REDC_1
+#define MPN_REDC_1(rp, up, mp, n, invm) \
+ do { \
+ mp_limb_t cy; \
+ mpn_redc_1 (up, mp, n, invm); \
+ cy = mpn_add_n (rp, up + n, up, n); \
+ if (cy != 0) \
+ mpn_sub_n (rp, rp, mp, n); \
+ } while (0)
+
#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
#define WANT_REDC_2 1
#endif
@@ -212,12 +222,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
mpn_sqr (tp, this_pp, n);
#if WANT_REDC_2
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- mpn_redc_1 (rp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (rp, tp, mp, n, mip[0]);
else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
mpn_redc_2 (rp, tp, mp, n, mip);
#else
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- mpn_redc_1 (rp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (rp, tp, mp, n, mip[0]);
#endif
else
mpn_redc_n (rp, tp, mp, n, mip);
@@ -229,12 +239,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
this_pp += n;
#if WANT_REDC_2
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- mpn_redc_1 (this_pp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
mpn_redc_2 (this_pp, tp, mp, n, mip);
#else
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- mpn_redc_1 (this_pp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
#endif
else
mpn_redc_n (this_pp, tp, mp, n, mip);
@@ -309,7 +319,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -319,7 +329,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
}
@@ -380,7 +390,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -390,7 +400,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
}
@@ -401,7 +411,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
@@ -440,7 +450,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -450,7 +460,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
}
@@ -501,7 +511,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -511,7 +521,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
}
@@ -522,7 +532,7 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
-#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_1 (rp, tp, mp, n, mip[0])
+#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
INNERLOOP;
}
else
@@ -545,12 +555,12 @@ mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#if WANT_REDC_2
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
- mpn_redc_1 (rp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (rp, tp, mp, n, mip[0]);
else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
mpn_redc_2 (rp, tp, mp, n, mip);
#else
if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
- mpn_redc_1 (rp, tp, mp, n, mip[0]);
+ MPN_REDC_1 (rp, tp, mp, n, mip[0]);
#endif
else
mpn_redc_n (rp, tp, mp, n, mip);
diff --git a/mpn/generic/powm_sec.c b/mpn/generic/powm_sec.c
index 315ae6e5e..24bb83de3 100644
--- a/mpn/generic/powm_sec.c
+++ b/mpn/generic/powm_sec.c
@@ -7,7 +7,7 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2007, 2008, 2009 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -56,6 +56,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define WANT_CACHE_SECURITY 1
+#undef MPN_REDC_1_SEC
+#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \
+ do { \
+ mp_limb_t cy; \
+ mpn_redc_1 (up, mp, n, invm); \
+ cy = mpn_add_n (rp, up + n, up, n); \
+ mpn_subcnd_n (rp, rp, mp, n, cy); \
+ } while (0)
/* Define our own mpn squaring function. We do this since we cannot use a
native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over
@@ -125,8 +133,6 @@ mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
if (n > 1)
{
mp_limb_t cy;
- TMP_DECL;
- TMP_MARK;
cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
tp[n - 1] = cy;
@@ -148,8 +154,6 @@ mpn_local_sqr (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
#endif
rp[2 * n - 1] += cy;
}
-
- TMP_FREE;
}
}
#endif
@@ -181,36 +185,46 @@ getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
}
}
+#ifndef POWM_SEC_TABLE
+#if GMP_NUMB_BITS < 50
+#define POWM_SEC_TABLE 2,33,96,780,2741
+#else
+#define POWM_SEC_TABLE 2,130,524,2578
+#endif
+#endif
+
+#if TUNE_PROGRAM_BUILD
+extern int win_size (mp_bitcnt_t);
+#else
static inline int
win_size (mp_bitcnt_t eb)
{
int k;
- static mp_bitcnt_t x[] = {0,4,27,100,325,1026,2905,7848,20457,51670,~(mp_bitcnt_t)0};
+ static mp_bitcnt_t x[] = {0,POWM_SEC_TABLE,~(mp_bitcnt_t)0};
for (k = 1; eb > x[k]; k++)
;
return k;
}
+#endif
-/* Convert U to REDC form, U_r = B^n * U mod M */
+/* Convert U to REDC form, U_r = B^n * U mod M.
+ Uses scratch space at tp of size 2un + n + 1. */
static void
redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
mp_ptr qp;
- TMP_DECL;
- TMP_MARK;
- qp = tp + un + n;
+ qp = tp + un + n; /* un + n - n + 1 = un + 1 limbs */
MPN_ZERO (tp, n);
MPN_COPY (tp + n, up, un);
mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n);
- TMP_FREE;
}
/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
- Requires that mp[n-1..0] is odd. FIXME: is this true?
- Requires that ep[en-1..0] is > 1.
- Uses scratch space at tp of 3n+1 limbs. */
+ Requires that mp[n-1..0] is odd.
+ Requires that ep[en-1..0] > 1.
+ Uses scratch space at tp as defined by mpn_powm_sec_itch. */
void
mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
mp_srcptr ep, mp_size_t en,
@@ -224,13 +238,10 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
mp_ptr pp, this_pp;
long i;
int cnd;
- TMP_DECL;
ASSERT (en > 1 || (en == 1 && ep[0] > 0));
ASSERT (n >= 1 && ((mp[0] & 1) != 0));
- TMP_MARK;
-
count_leading_zeros (cnt, ep[en - 1]);
ebi = (mp_bitcnt_t) en * GMP_LIMB_BITS - cnt;
@@ -239,20 +250,32 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
binvert_limb (minv, mp[0]);
minv = -minv;
- pp = tp + 4 * n;
+ pp = tp;
+ tp += (n << windowsize); /* put tp after power table */
+ /* Compute pp[0] table entry */
+ /* scratch: | n | 1 | n+2 | */
+ /* | pp[0] | 1 | redcify | */
this_pp = pp;
this_pp[n] = 1;
- redcify (this_pp, this_pp + n, 1, mp, n, tp + 6 * n);
+ redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1);
this_pp += n;
- redcify (this_pp, bp, bn, mp, n, tp + 6 * n);
+
+ /* Compute pp[1] table entry. To avoid excessive scratch usage in the
+ degenerate situation where B >> M, we let redcify use scratch space which
+ will later be used by the pp table (element 2 and up). */
+ /* scratch: | n | n | bn + n + 1 | */
+ /* | pp[0] | pp[1] | redcify | */
+ redcify (this_pp, bp, bn, mp, n, this_pp + n);
/* Precompute powers of b and put them in the temporary area at pp. */
+ /* scratch: | n | n | ... | | 2n | */
+ /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | product | */
for (i = (1 << windowsize) - 2; i > 0; i--)
{
mpn_mul_basecase (tp, this_pp, n, pp + n, n);
this_pp += n;
- mpn_redc_1_sec (this_pp, tp, mp, n, minv);
+ MPN_REDC_1_SEC (this_pp, tp, mp, n, minv);
}
expbits = getbits (ep, ebi, windowsize);
@@ -261,8 +284,15 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
else
ebi -= windowsize;
+#if WANT_CACHE_SECURITY
+ mpn_tabselect (rp, pp, n, 1 << windowsize, expbits);
+#else
MPN_COPY (rp, pp + n * expbits, n);
+#endif
+ /* Main exponentiation loop. */
+ /* scratch: | n | n | ... | | 3n-4n | */
+ /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | loop scratch | */
while (ebi != 0)
{
expbits = getbits (ep, ebi, windowsize);
@@ -278,7 +308,7 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
do
{
mpn_local_sqr (tp, rp, n, tp + 2 * n);
- mpn_redc_1_sec (rp, tp, mp, n, minv);
+ MPN_REDC_1_SEC (rp, tp, mp, n, minv);
this_windowsize--;
}
while (this_windowsize != 0);
@@ -289,52 +319,36 @@ mpn_powm_sec (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
#else
mpn_mul_basecase (tp, rp, n, pp + n * expbits, n);
#endif
- mpn_redc_1_sec (rp, tp, mp, n, minv);
+ MPN_REDC_1_SEC (rp, tp, mp, n, minv);
}
MPN_COPY (tp, rp, n);
MPN_ZERO (tp + n, n);
- mpn_redc_1_sec (rp, tp, mp, n, minv);
+ MPN_REDC_1_SEC (rp, tp, mp, n, minv);
cnd = mpn_sub_n (tp, rp, mp, n); /* we need just retval */
mpn_subcnd_n (rp, rp, mp, n, !cnd);
- TMP_FREE;
}
-#if ! HAVE_NATIVE_mpn_tabselect
-/* Select entry `which' from table `tab', which has nents entries, each `n'
- limbs. Store the selected entry at rp. Reads entire table to avoid
- side-channel information leaks. O(n*nents).
- FIXME: Move to its own file. */
-void
-mpn_tabselect (volatile mp_limb_t *rp, volatile mp_limb_t *tab, mp_size_t n,
- mp_size_t nents, mp_size_t which)
-{
- mp_size_t k, i;
- mp_limb_t mask;
- volatile mp_limb_t *tp;
-
- for (k = 0; k < nents; k++)
- {
- mask = -(mp_limb_t) (which == k);
- tp = tab + n * k;
- for (i = 0; i < n; i++)
- {
- rp[i] = (rp[i] & ~mask) | (tp[i] & mask);
- }
- }
-}
-#endif
-
mp_size_t
mpn_powm_sec_itch (mp_size_t bn, mp_size_t en, mp_size_t n)
{
int windowsize;
mp_size_t redcify_itch, itch;
- windowsize = win_size (en * GMP_NUMB_BITS); /* slight over-estimate of exp */
- itch = 4 * n + (n << windowsize);
- redcify_itch = 2 * bn + n + 1;
- /* The 6n is due to the placement of reduce scratch 6n into the start of the
- scratch area. */
- return MAX (itch, redcify_itch + 6 * n);
+ /* The top scratch usage will either be when reducing B in the 2nd redcify
+ call, or more typically n*2^windowsize + 3n or 4n, in the main loop. (It
+ is 3n or 4n depending on if we use mpn_local_sqr or a native
+ mpn_sqr_basecase. We assume 4n always for now.) */
+
+ windowsize = win_size (en * GMP_LIMB_BITS); /* slight over-estimate of exp */
+
+ /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify call,
+ the 2bn + n + 1 term is due to redcify's own usage. */
+ redcify_itch = (2 * n) + (2 * bn + n + 1);
+
+ /* The n * 2^windowsize term is due to the power table, the 4n term is due to
+ scratch needs of squaring/multiplication in the exponentiation loop. */
+ itch = (n << windowsize) + (4 * n);
+
+ return MAX (itch, redcify_itch);
}
diff --git a/mpn/generic/redc_1.c b/mpn/generic/redc_1.c
index 177f3932f..3567414eb 100644
--- a/mpn/generic/redc_1.c
+++ b/mpn/generic/redc_1.c
@@ -25,7 +25,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
void
-mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
+mpn_redc_1 (mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
{
mp_size_t j;
mp_limb_t cy;
@@ -40,7 +40,4 @@ mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
up[0] = cy;
up++;
}
- cy = mpn_add_n (rp, up, up - n, n);
- if (cy != 0)
- mpn_sub_n (rp, rp, mp, n);
}
diff --git a/mpn/generic/redc_1_sec.c b/mpn/generic/tabselect.c
index 3d914381c..02e52fdc0 100644
--- a/mpn/generic/redc_1_sec.c
+++ b/mpn/generic/tabselect.c
@@ -1,10 +1,9 @@
-/* mpn_redc_1_sec. Set cp[] <- up[]/R^n mod mp[]. Clobber up[].
- mp[] is n limbs; up[] is 2n limbs.
+/* mpn_tabselect.
THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
-Copyright (C) 2000, 2001, 2002, 2004, 2008, 2009 Free Software Foundation, Inc.
+Copyright 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -24,22 +23,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp.h"
#include "gmp-impl.h"
+
+/* Select entry `which' from table `tab', which has nents entries, each `n'
+ limbs. Store the selected entry at rp. Reads the entire table, whichever
+ entry is selected, to avoid side-channel information leaks. O(n*nents).
+ Entry k occupies the n limbs starting at tab + n*k. */
void
-mpn_redc_1_sec (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
+mpn_tabselect (volatile mp_limb_t *rp, volatile mp_limb_t *tab, mp_size_t n,
+ mp_size_t nents, mp_size_t which)
{
- mp_size_t j;
- mp_limb_t cy;
-
- ASSERT (n > 0);
- ASSERT_MPN (up, 2*n);
+ mp_size_t k, i;
+ mp_limb_t mask;
+ volatile mp_limb_t *tp;
- for (j = n - 1; j >= 0; j--)
+ for (k = 0; k < nents; k++)
{
- cy = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
- ASSERT (up[0] == 0);
- up[0] = cy;
- up++;
+ mask = -(mp_limb_t) (which == k);
+ tp = tab + n * k;
+ for (i = 0; i < n; i++)
+ {
+ rp[i] = (rp[i] & ~mask) | (tp[i] & mask);
+ }
}
- cy = mpn_add_n (rp, up, up - n, n);
- mpn_subcnd_n (rp, rp, mp, n, cy);
}
diff --git a/mpn/generic/udiv_w_sdiv.c b/mpn/generic/udiv_w_sdiv.c
index c01f95847..ceefd1b5f 100644
--- a/mpn/generic/udiv_w_sdiv.c
+++ b/mpn/generic/udiv_w_sdiv.c
@@ -9,7 +9,7 @@
GNU MP RELEASE.
-Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+Copyright 1992, 1994, 1996, 2000, 2011 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -116,12 +116,12 @@ mpn_udiv_w_sdiv (rp, a1, a0, d)
{ /* Hence a1 = d - 1 = 2*b1 - 1 */
if (a0 >= -d)
{
- q = -1;
+ q = -CNST_LIMB(1);
r = a0 + d;
}
else
{
- q = -2;
+ q = -CNST_LIMB(2);
r = a0 + 2*d;
}
}
diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h
index 0841c82aa..77e02f518 100644
--- a/mpn/ia64/gmp-mparam.h
+++ b/mpn/ia64/gmp-mparam.h
@@ -1,6 +1,6 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010 Free Software
+Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010, 2011 Free Software
Foundation, Inc.
This file is part of the GNU MP Library.
@@ -21,70 +21,94 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define GMP_LIMB_BITS 64
#define BYTES_PER_MP_LIMB 8
-/* 1300MHz Itanium2 (babe.fsffrance.org) */
-
+/* 900MHz Itanium2 (titanic.gmplib.org) */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 26
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD 12
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
#define MUL_TOOM22_THRESHOLD 40
-#define MUL_TOOM33_THRESHOLD 122
-#define MUL_TOOM44_THRESHOLD 212
+#define MUL_TOOM33_THRESHOLD 129
+#define MUL_TOOM44_THRESHOLD 214
#define MUL_TOOM6H_THRESHOLD 318
#define MUL_TOOM8H_THRESHOLD 430
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 146
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 145
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 126
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151
#define SQR_BASECASE_THRESHOLD 11
#define SQR_TOOM2_THRESHOLD 84
-#define SQR_TOOM3_THRESHOLD 125
+#define SQR_TOOM3_THRESHOLD 135
#define SQR_TOOM4_THRESHOLD 494
-#define SQR_TOOM6_THRESHOLD 0 /* never toom4 */
-#define SQR_TOOM8_THRESHOLD 0 /* never toom6 */
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 98
#define MULMOD_BNM1_THRESHOLD 23
-#define SQRMOD_BNM1_THRESHOLD 25
+#define SQRMOD_BNM1_THRESHOLD 28
+
+#define POWM_SEC_TABLE 2,29,130,905
-#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */
+#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
#define MUL_FFT_TABLE3 \
- { { 444, 5}, { 27, 6}, { 14, 5}, { 29, 6}, \
- { 35, 7}, { 18, 6}, { 37, 7}, { 19, 6}, \
+ { { 476, 5}, { 27, 6}, { 14, 5}, { 29, 6}, \
+ { 33, 7}, { 17, 6}, { 37, 7}, { 19, 6}, \
{ 39, 7}, { 21, 6}, { 43, 7}, { 33, 8}, \
{ 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \
- { 21, 7}, { 43, 8}, { 29, 9}, { 15, 8}, \
- { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
- { 49, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \
- { 63, 9}, { 35, 8}, { 71, 9}, { 43,10}, \
+ { 21, 7}, { 43, 8}, { 37, 9}, { 19, 8}, \
+ { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \
+ { 57, 9}, { 31, 8}, { 63, 9}, { 43,10}, \
{ 23, 9}, { 59,10}, { 31, 9}, { 71,10}, \
- { 39, 9}, { 87,10}, { 47, 9}, { 99,10}, \
+ { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \
{ 55,11}, { 31,10}, { 87,11}, { 47,10}, \
{ 111,12}, { 31,11}, { 63,10}, { 143,11}, \
{ 79,10}, { 167,11}, { 95,10}, { 191,11}, \
{ 111,12}, { 63,11}, { 143,10}, { 287, 9}, \
{ 575,10}, { 303,11}, { 159,10}, { 319,12}, \
{ 95,11}, { 191,10}, { 399,11}, { 207,10}, \
- { 431,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 76
+ { 431,13}, { 63,12}, { 127,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 335,10}, { 671,11}, { 367,12}, \
+ { 191,11}, { 399,10}, { 799,11}, { 431,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 415,11}, { 863,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 607,13}, { 319,12}, { 735,13}, \
+ { 383,12}, { 799,11}, { 1599,12}, { 863,13}, \
+ { 447,12}, { 927,11}, { 1855,14}, { 255,13}, \
+ { 511,12}, { 1055,13}, { 575,12}, { 1215,13}, \
+ { 639,12}, { 1279,13}, { 703,14}, { 383,13}, \
+ { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \
+ { 895,12}, { 1791,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \
+ { 1599,12}, { 3199,13}, { 1663,14}, { 895,13}, \
+ { 1855,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,14}, { 1279,13}, { 2687,14}, \
+ { 1407,15}, { 767,14}, { 1535,13}, { 3199,14}, \
+ { 1663,13}, { 3455,14}, { 1791,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 155
#define MUL_FFT_THRESHOLD 5760
-#define SQR_FFT_MODF_THRESHOLD 440 /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD 436 /* k = 5 */
#define SQR_FFT_TABLE3 \
- { { 440, 5}, { 14, 4}, { 29, 5}, { 29, 6}, \
- { 15, 5}, { 31, 6}, { 35, 7}, { 18, 6}, \
- { 37, 7}, { 33, 8}, { 17, 7}, { 37, 8}, \
+ { { 436, 5}, { 14, 4}, { 29, 5}, { 31, 6}, \
+ { 35, 7}, { 18, 6}, { 37, 7}, { 37, 8}, \
{ 19, 7}, { 40, 8}, { 37, 9}, { 19, 8}, \
{ 43, 9}, { 23, 8}, { 49, 9}, { 27, 8}, \
{ 57, 9}, { 43,10}, { 23, 9}, { 55,10}, \
@@ -93,45 +117,69 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
{ 87,11}, { 47,10}, { 111,12}, { 31,11}, \
{ 63,10}, { 135,11}, { 79,10}, { 167,11}, \
{ 95,10}, { 191,11}, { 111,12}, { 63,11}, \
- { 127,10}, { 255,11}, { 143,10}, { 303,11}, \
- { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
- { 399,11}, { 207,10}, { 431,13}, { 8192,14}, \
- { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 66
+ { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \
+ { 575,10}, { 303,11}, { 159,10}, { 319,12}, \
+ { 95,11}, { 191,10}, { 399,11}, { 207,10}, \
+ { 431,13}, { 63,12}, { 127,11}, { 271,10}, \
+ { 543,11}, { 303,12}, { 159,11}, { 335,10}, \
+ { 671,11}, { 367,10}, { 735,12}, { 191,11}, \
+ { 399,10}, { 799,11}, { 431,12}, { 223,11}, \
+ { 463,13}, { 127,12}, { 255,11}, { 543,12}, \
+ { 287,11}, { 607,12}, { 319,11}, { 671,12}, \
+ { 351,11}, { 735,13}, { 191,12}, { 383,11}, \
+ { 799,12}, { 415,11}, { 863,12}, { 447,11}, \
+ { 895,14}, { 127,13}, { 255,12}, { 543,11}, \
+ { 1087,12}, { 607,13}, { 319,12}, { 735,13}, \
+ { 383,12}, { 863,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1183,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
+ { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \
+ { 2175,13}, { 1215,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1471,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \
+ { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \
+ { 3199,14}, { 1663,13}, { 3455,14}, { 1791,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 151
#define SQR_FFT_THRESHOLD 4032
#define MULLO_BASECASE_THRESHOLD 29
#define MULLO_DC_THRESHOLD 57
#define MULLO_MUL_N_THRESHOLD 11278
-#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIV_QR_THRESHOLD 64
#define DC_DIVAPPR_Q_THRESHOLD 222
#define DC_BDIV_QR_THRESHOLD 95
#define DC_BDIV_Q_THRESHOLD 264
-#define INV_MULMOD_BNM1_THRESHOLD 82
-#define INV_NEWTON_THRESHOLD 11
-#define INV_APPR_THRESHOLD 18
+#define INV_MULMOD_BNM1_THRESHOLD 86
+#define INV_NEWTON_THRESHOLD 139
+#define INV_APPR_THRESHOLD 147
#define BINV_NEWTON_THRESHOLD 252
-#define REDC_1_TO_REDC_2_THRESHOLD 0
+#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */
#define REDC_2_TO_REDC_N_THRESHOLD 147
#define MU_DIV_QR_THRESHOLD 1142
-#define MU_DIVAPPR_Q_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 1142
#define MUPI_DIV_QR_THRESHOLD 0 /* always */
-#define MU_BDIV_QR_THRESHOLD 1187
+#define MU_BDIV_QR_THRESHOLD 1210
#define MU_BDIV_Q_THRESHOLD 1470
#define MATRIX22_STRASSEN_THRESHOLD 23
#define HGCD_THRESHOLD 117
-#define GCD_DC_THRESHOLD 469
+#define HGCD_APPR_THRESHOLD 111
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 555
#define GCDEXT_DC_THRESHOLD 368
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 1204
-#define SET_STR_PRECOMPUTE_THRESHOLD 3266
+#define GET_STR_PRECOMPUTE_THRESHOLD 22
+#define SET_STR_DC_THRESHOLD 1474
+#define SET_STR_PRECOMPUTE_THRESHOLD 3168
diff --git a/mpn/ia64/tabselect.asm b/mpn/ia64/tabselect.asm
new file mode 100644
index 000000000..cc5b49b04
--- /dev/null
+++ b/mpn/ia64/tabselect.asm
@@ -0,0 +1,139 @@
+dnl IA-64 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 2.5
+
+C NOTES
+C * Using software pipelining could trivially yield 2 c/l without unrolling,
+C or 1+epsilon with unrolling. (This code was modelled after the powerpc64
+C code, for simplicity.)
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `r32')
+define(`tp', `r33')
+define(`n', `r34')
+define(`nents', `r35')
+define(`which', `r36')
+
+define(`mask', `r8')
+
+define(`rp1', `r32')
+define(`tp1', `r33')
+define(`rp2', `r14')
+define(`tp2', `r15')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+.mmi; addp4 rp = 0, rp C M I
+ addp4 tp = 0, tp C M I
+ zxt4 n = n C I
+.mii; nop 0
+ zxt4 nents = nents C I
+ zxt4 which = which C I
+ ;;
+')
+.mmi; add rp2 = 8, rp1
+ add tp2 = 8, tp1
+ add r6 = -2, n
+ ;;
+.mmi; cmp.eq p10, p0 = 1, n
+ and r9 = 1, n C r9 = n mod 2; p8 is set from it below (n even)
+ shr.u r6 = r6, 1 C inner loop count
+ ;;
+.mmi; cmp.eq p8, p0 = 0, r9
+ sub which = nents, which
+ shl n = n, 3
+ ;;
+
+L(outer):
+.mmi cmp.eq p6, p7 = which, nents C are we at the selected table entry?
+ nop 0
+ mov ar.lc = r6 C I0
+ ;;
+.mmb;
+ (p6) mov mask = -1
+ (p7) mov mask = 0
+ (p8) br.dptk L(top) C branch to loop entry if n even
+ ;;
+
+.mmi; ld8 r16 = [tp1], 8
+ add tp2 = 8, tp2
+ nop 0
+ ;;
+.mmi; ld8 r18 = [rp1]
+ and r16 = r16, mask
+ nop 0
+ ;;
+.mmi; andcm r18 = r18, mask
+ ;;
+ or r16 = r16, r18
+ nop 0
+ ;;
+.mmb; st8 [rp1] = r16, 8
+ add rp2 = 8, rp2
+ (p10) br.dpnt L(end)
+
+ ALIGN(32)
+L(top):
+.mmi; ld8 r16 = [tp1], 16
+ ld8 r17 = [tp2], 16
+ nop 0
+ ;;
+.mmi; ld8 r18 = [rp1]
+ and r16 = r16, mask
+ nop 0
+.mmi; ld8 r19 = [rp2]
+ and r17 = r17, mask
+ nop 0
+ ;;
+.mmi; andcm r18 = r18, mask
+ andcm r19 = r19, mask
+ nop 0
+ ;;
+.mmi; or r16 = r16, r18
+ or r17 = r17, r19
+ nop 0
+ ;;
+.mmb; st8 [rp1] = r16, 16
+ st8 [rp2] = r17, 16
+ br.cloop.dptk L(top)
+ ;;
+L(end):
+.mmi; sub rp1 = rp1, n C move rp back to beginning
+ sub rp2 = rp2, n C move rp back to beginning
+ cmp.ne p9, p0 = 1, nents
+.mmb; add nents = -1, nents
+ nop 0
+ (p9) br.dptk L(outer)
+ ;;
+
+.mib; nop 0
+ nop 0
+ br.ret.sptk.many b0
+EPILOGUE()
diff --git a/mpn/pa64/gmp-mparam.h b/mpn/pa64/gmp-mparam.h
index d0e86d856..081757aca 100644
--- a/mpn/pa64/gmp-mparam.h
+++ b/mpn/pa64/gmp-mparam.h
@@ -25,14 +25,16 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
#define MOD_1U_TO_MOD_1_1_THRESHOLD 10
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD 21
#define DIVEXACT_1_THRESHOLD 0 /* always */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
@@ -47,16 +49,20 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 54
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 56
-#define SQR_TOOM3_THRESHOLD 169
-#define SQR_TOOM4_THRESHOLD 280
-#define SQR_TOOM6_THRESHOLD 0
-#define SQR_TOOM8_THRESHOLD 309
+#define SQR_BASECASE_THRESHOLD 5
+#define SQR_TOOM2_THRESHOLD 58
+#define SQR_TOOM3_THRESHOLD 153
+#define SQR_TOOM4_THRESHOLD 278
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 0 /* always */
-#define MULMOD_BNM1_THRESHOLD 16
+#define MULMID_TOOM42_THRESHOLD 56
+
+#define MULMOD_BNM1_THRESHOLD 15
#define SQRMOD_BNM1_THRESHOLD 19
+#define POWM_SEC_TABLE 2,23,228,1084
+
#define MUL_FFT_MODF_THRESHOLD 336 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 336, 5}, { 11, 4}, { 23, 5}, { 21, 6}, \
@@ -196,34 +202,36 @@ with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 1856
#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 133
-#define MULLO_MUL_N_THRESHOLD 4292
+#define MULLO_DC_THRESHOLD 113
+#define MULLO_MUL_N_THRESHOLD 4658
-#define DC_DIV_QR_THRESHOLD 140
-#define DC_DIVAPPR_Q_THRESHOLD 422
-#define DC_BDIV_QR_THRESHOLD 150
-#define DC_BDIV_Q_THRESHOLD 351
+#define DC_DIV_QR_THRESHOLD 123
+#define DC_DIVAPPR_Q_THRESHOLD 372
+#define DC_BDIV_QR_THRESHOLD 142
+#define DC_BDIV_Q_THRESHOLD 312
-#define INV_MULMOD_BNM1_THRESHOLD 60
-#define INV_NEWTON_THRESHOLD 348
-#define INV_APPR_THRESHOLD 324
+#define INV_MULMOD_BNM1_THRESHOLD 58
+#define INV_NEWTON_THRESHOLD 315
+#define INV_APPR_THRESHOLD 315
-#define BINV_NEWTON_THRESHOLD 363
+#define BINV_NEWTON_THRESHOLD 360
#define REDC_1_TO_REDC_N_THRESHOLD 101
-#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIV_QR_THRESHOLD 979
#define MU_DIVAPPR_Q_THRESHOLD 1142
-#define MUPI_DIV_QR_THRESHOLD 110
+#define MUPI_DIV_QR_THRESHOLD 93
#define MU_BDIV_QR_THRESHOLD 889
-#define MU_BDIV_Q_THRESHOLD 1334
+#define MU_BDIV_Q_THRESHOLD 1187
#define MATRIX22_STRASSEN_THRESHOLD 9
-#define HGCD_THRESHOLD 242
-#define GCD_DC_THRESHOLD 752
-#define GCDEXT_DC_THRESHOLD 545
+#define HGCD_THRESHOLD 234
+#define HGCD_APPR_THRESHOLD 300
+#define HGCD_REDUCE_THRESHOLD 1553
+#define GCD_DC_THRESHOLD 684
+#define GCDEXT_DC_THRESHOLD 525
#define JACOBI_BASE_METHOD 2
#define GET_STR_DC_THRESHOLD 21
#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 2008
-#define SET_STR_PRECOMPUTE_THRESHOLD 4066
+#define SET_STR_DC_THRESHOLD 1951
+#define SET_STR_PRECOMPUTE_THRESHOLD 4034
diff --git a/mpn/powerpc32/aors_n.asm b/mpn/powerpc32/aors_n.asm
index f9e9b50d5..12115a9e9 100644
--- a/mpn/powerpc32/aors_n.asm
+++ b/mpn/powerpc32/aors_n.asm
@@ -19,14 +19,17 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C 603e: ?
-C 604e: ? old: 3.25
-C 75x (G3): ? old: 3.5
-C 7400,7410 (G4): 3.25
-C 744x,745x (G4+): 4
-C power4/ppc970: ? old: 2.0
-C power5: ? old: 2.5
+C cycles/limb
+C 603e: ?
+C 604e: ? old: 3.25
+C 75x (G3): ? old: 3.5
+C 7400,7410 (G4): 3.25
+C 744x,745x (G4+): 4
+C POWER3/PPC630 2
+C POWER4/PPC970 2.4
+C POWER5 2.75
+C POWER6 40-140
+C POWER7 3
C INPUT PARAMETERS
define(`rp', `r3')
diff --git a/mpn/powerpc32/p3-p7/aors_n.asm b/mpn/powerpc32/p3-p7/aors_n.asm
new file mode 100644
index 000000000..6999182a8
--- /dev/null
+++ b/mpn/powerpc32/p3-p7/aors_n.asm
@@ -0,0 +1,176 @@
+dnl PowerPC-32 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1.5
+C POWER4/PPC970 2
+C POWER5 2
+C POWER6 2.78
+C POWER7 2.15-2.87
+
+C This code is based on powerpc64/aors_n.asm.
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+ifdef(`OPERATION_add_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+ SETCBR(r7)
+ b L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+ CLRCB
+L(ent): stw r31, -4(r1)
+ stw r30, -8(r1)
+ stw r29, -12(r1)
+ stw r28, -16(r1)
+
+ rlwinm. r0, r6, 0,30,31 C r0 = n & 3, set cr0
+ cmpwi cr6, r0, 2
+ addi r6, r6, 3 C compute count...
+ srwi r6, r6, 2 C ...for ctr
+ mtctr r6 C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): lwz r8, 0(r4) C load s1 limb
+ lwz r9, 0(r5) C load s2 limb
+ lwz r10, 4(r4) C load s1 limb
+ lwz r11, 4(r5) C load s2 limb
+ lwz r12, 8(r4) C load s1 limb
+ addi r4, r4, 12
+ lwz r0, 8(r5) C load s2 limb
+ addi r5, r5, 12
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ stw r29, 0(r3)
+ stw r30, 4(r3)
+ stw r31, 8(r3)
+ addi r3, r3, 12
+ bdnz L(go)
+ b L(ret)
+
+L(b01): lwz r12, 0(r4) C load s1 limb
+ addi r4, r4, 4
+ lwz r0, 0(r5) C load s2 limb
+ addi r5, r5, 4
+ ADDSUBC r31, r0, r12 C add/sub with carry
+ stw r31, 0(r3)
+ addi r3, r3, 4
+ bdnz L(go)
+ b L(ret)
+
+L(b10): lwz r10, 0(r4) C load s1 limb
+ lwz r11, 0(r5) C load s2 limb
+ lwz r12, 4(r4) C load s1 limb
+ addi r4, r4, 8
+ lwz r0, 4(r5) C load s2 limb
+ addi r5, r5, 8
+ ADDSUBC r30, r11, r10 C add/sub with carry
+ ADDSUBC r31, r0, r12 C add/sub with carry
+ stw r30, 0(r3)
+ stw r31, 4(r3)
+ addi r3, r3, 8
+ bdnz L(go)
+ b L(ret)
+
+L(b00): C INITCY C clear/set cy
+L(go): lwz r6, 0(r4) C load s1 limb
+ lwz r7, 0(r5) C load s2 limb
+ lwz r8, 4(r4) C load s1 limb
+ lwz r9, 4(r5) C load s2 limb
+ lwz r10, 8(r4) C load s1 limb
+ lwz r11, 8(r5) C load s2 limb
+ lwz r12, 12(r4) C load s1 limb
+ lwz r0, 12(r5) C load s2 limb
+ bdz L(end)
+
+ addi r4, r4, 16
+ addi r5, r5, 16
+
+ ALIGN(16)
+L(top): ADDSUBC r28, r7, r6
+ lwz r6, 0(r4) C load s1 limb
+ lwz r7, 0(r5) C load s2 limb
+ ADDSUBC r29, r9, r8
+ lwz r8, 4(r4) C load s1 limb
+ lwz r9, 4(r5) C load s2 limb
+ ADDSUBC r30, r11, r10
+ lwz r10, 8(r4) C load s1 limb
+ lwz r11, 8(r5) C load s2 limb
+ ADDSUBC r31, r0, r12
+ lwz r12, 12(r4) C load s1 limb
+ lwz r0, 12(r5) C load s2 limb
+ stw r28, 0(r3)
+ addi r4, r4, 16
+ stw r29, 4(r3)
+ addi r5, r5, 16
+ stw r30, 8(r3)
+ stw r31, 12(r3)
+ addi r3, r3, 16
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): ADDSUBC r28, r7, r6
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ stw r28, 0(r3)
+ stw r29, 4(r3)
+ stw r30, 8(r3)
+ stw r31, 12(r3)
+
+L(ret): lwz r31, -4(r1)
+ lwz r30, -8(r1)
+ lwz r29, -12(r1)
+ lwz r28, -16(r1)
+
+ subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc32/p5/gmp-mparam.h b/mpn/powerpc32/p5/gmp-mparam.h
index a8400ce65..ba210ecc4 100644
--- a/mpn/powerpc32/p5/gmp-mparam.h
+++ b/mpn/powerpc32/p5/gmp-mparam.h
@@ -30,114 +30,117 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 46
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 62
+#define BMOD_1_TO_MOD_1_THRESHOLD 61
#define MUL_TOOM22_THRESHOLD 22
-#define MUL_TOOM33_THRESHOLD 78
+#define MUL_TOOM33_THRESHOLD 57
#define MUL_TOOM44_THRESHOLD 130
-#define MUL_TOOM6H_THRESHOLD 206
-#define MUL_TOOM8H_THRESHOLD 260
+#define MUL_TOOM6H_THRESHOLD 189
+#define MUL_TOOM8H_THRESHOLD 309
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 83
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 42
+#define SQR_BASECASE_THRESHOLD 6
+#define SQR_TOOM2_THRESHOLD 40
#define SQR_TOOM3_THRESHOLD 77
-#define SQR_TOOM4_THRESHOLD 169
-#define SQR_TOOM6_THRESHOLD 246
-#define SQR_TOOM8_THRESHOLD 381
+#define SQR_TOOM4_THRESHOLD 124
+#define SQR_TOOM6_THRESHOLD 140
+#define SQR_TOOM8_THRESHOLD 238
+
+#define MULMID_TOOM42_THRESHOLD 40
#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 18
+#define SQRMOD_BNM1_THRESHOLD 16
+
+#define POWM_SEC_TABLE 4,29,252,840,2080
-#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */
#define MUL_FFT_TABLE3 \
- { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
- { 13, 5}, { 27, 6}, { 21, 7}, { 11, 6}, \
- { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
- { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
- { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
- { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
- { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \
- { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
- { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \
- { 575,10}, { 159,11}, { 95, 9}, { 383,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \
- { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
- { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \
- { 767,10}, { 415, 9}, { 831,11}, { 223,12}, \
- { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 76
+ { { 412, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 55,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
+ { 95,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 335, 9}, { 671,10}, { 351, 9}, \
+ { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 415, 9}, { 831,11}, { 223,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 71
#define MUL_FFT_THRESHOLD 4736
-#define SQR_FFT_MODF_THRESHOLD 316 /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
#define SQR_FFT_TABLE3 \
- { { 316, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
{ 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
- { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
- { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 6}, { 77, 7}, { 39, 8}, { 23, 7}, \
- { 47, 8}, { 27, 9}, { 15, 8}, { 39, 9}, \
- { 23, 8}, { 47,10}, { 15, 7}, { 121, 9}, \
- { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
- { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \
- { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
- { 79, 9}, { 159, 8}, { 319, 9}, { 175,10}, \
- { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \
+ { 27, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 47,10}, { 31, 9}, \
+ { 71,10}, { 47,11}, { 31,10}, { 63, 9}, \
+ { 127, 8}, { 255, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
{ 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
{ 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \
- { 159, 9}, { 319,10}, { 175,11}, { 95,10}, \
- { 191, 9}, { 383,10}, { 207,12}, { 63,11}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
{ 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
{ 543,10}, { 287, 9}, { 575,10}, { 303,11}, \
{ 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
{ 671,10}, { 351,11}, { 191,10}, { 383, 9}, \
{ 767,10}, { 415,11}, { 223,10}, { 447,12}, \
{ 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 88
+#define SQR_FFT_TABLE3_SIZE 76
#define SQR_FFT_THRESHOLD 3712
#define MULLO_BASECASE_THRESHOLD 2
#define MULLO_DC_THRESHOLD 68
#define MULLO_MUL_N_THRESHOLD 9236
-#define DC_DIV_QR_THRESHOLD 70
-#define DC_DIVAPPR_Q_THRESHOLD 238
+#define DC_DIV_QR_THRESHOLD 69
+#define DC_DIVAPPR_Q_THRESHOLD 220
#define DC_BDIV_QR_THRESHOLD 75
#define DC_BDIV_Q_THRESHOLD 188
#define INV_MULMOD_BNM1_THRESHOLD 54
-#define INV_NEWTON_THRESHOLD 250
-#define INV_APPR_THRESHOLD 246
+#define INV_NEWTON_THRESHOLD 230
+#define INV_APPR_THRESHOLD 230
-#define BINV_NEWTON_THRESHOLD 375
+#define BINV_NEWTON_THRESHOLD 278
#define REDC_1_TO_REDC_N_THRESHOLD 87
-#define MU_DIV_QR_THRESHOLD 1334
-#define MU_DIVAPPR_Q_THRESHOLD 1387
-#define MUPI_DIV_QR_THRESHOLD 114
-#define MU_BDIV_QR_THRESHOLD 1078
-#define MU_BDIV_Q_THRESHOLD 1334
+#define MU_DIV_QR_THRESHOLD 1210
+#define MU_DIVAPPR_Q_THRESHOLD 1308
+#define MUPI_DIV_QR_THRESHOLD 106
+#define MU_BDIV_QR_THRESHOLD 1017
+#define MU_BDIV_Q_THRESHOLD 1210
#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 104
-#define GCD_DC_THRESHOLD 424
-#define GCDEXT_DC_THRESHOLD 321
+#define HGCD_THRESHOLD 110
+#define HGCD_APPR_THRESHOLD 138
+#define HGCD_REDUCE_THRESHOLD 2578
+#define GCD_DC_THRESHOLD 408
+#define GCDEXT_DC_THRESHOLD 298
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 454
-#define SET_STR_PRECOMPUTE_THRESHOLD 1074
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 527
+#define SET_STR_PRECOMPUTE_THRESHOLD 1090
diff --git a/mpn/powerpc32/p6/gmp-mparam.h b/mpn/powerpc32/p6/gmp-mparam.h
index 73951d0ae..529a66d19 100644
--- a/mpn/powerpc32/p6/gmp-mparam.h
+++ b/mpn/powerpc32/p6/gmp-mparam.h
@@ -29,115 +29,127 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_NORM_THRESHOLD 3
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 15
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MUL_TOOM22_THRESHOLD 34
-#define MUL_TOOM33_THRESHOLD 70
-#define MUL_TOOM44_THRESHOLD 187
-#define MUL_TOOM6H_THRESHOLD 286
-#define MUL_TOOM8H_THRESHOLD 321
+#define MUL_TOOM22_THRESHOLD 19
+#define MUL_TOOM33_THRESHOLD 55
+#define MUL_TOOM44_THRESHOLD 88
+#define MUL_TOOM6H_THRESHOLD 137
+#define MUL_TOOM8H_THRESHOLD 181
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 110
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 118
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 107
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 56
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 57
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56
#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 68
-#define SQR_TOOM3_THRESHOLD 113
-#define SQR_TOOM4_THRESHOLD 312
-#define SQR_TOOM6_THRESHOLD 330
-#define SQR_TOOM8_THRESHOLD 357
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 56
+#define SQR_TOOM4_THRESHOLD 130
+#define SQR_TOOM6_THRESHOLD 189
+#define SQR_TOOM8_THRESHOLD 296
-#define MULMOD_BNM1_THRESHOLD 19
-#define SQRMOD_BNM1_THRESHOLD 20
+#define MULMID_TOOM42_THRESHOLD 26
-#define MUL_FFT_MODF_THRESHOLD 304 /* k = 5 */
+#define MULMOD_BNM1_THRESHOLD 7
+#define SQRMOD_BNM1_THRESHOLD 12
+
+#define POWM_SEC_TABLE 2,26,127,453,1068
+
+#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */
#define MUL_FFT_TABLE3 \
- { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 17, 7}, { 9, 6}, \
- { 20, 7}, { 11, 6}, { 24, 7}, { 13, 8}, \
- { 7, 7}, { 21, 8}, { 11, 7}, { 27, 9}, \
- { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \
- { 41, 8}, { 23, 7}, { 47, 8}, { 27, 9}, \
+ { { 212, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
+ { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \
+ { 19, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
+ { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \
+ { 31, 8}, { 19, 7}, { 39, 8}, { 23, 9}, \
{ 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
- { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
- { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
- { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
- { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \
- { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \
- { 191, 8}, { 383,11}, { 63,10}, { 127, 9}, \
- { 255, 8}, { 511, 9}, { 271,10}, { 143, 9}, \
- { 287,10}, { 159, 9}, { 319,11}, { 95,10}, \
- { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
- { 287,11}, { 159,10}, { 319, 9}, { 639,10}, \
- { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \
- { 415,11}, { 223,10}, { 447,12}, { 4096,13}, \
- { 8192,14}, { 16384,15}, { 32768,16} }
-#define MUL_FFT_TABLE3_SIZE 83
-#define MUL_FFT_THRESHOLD 4736
-
-#define SQR_FFT_MODF_THRESHOLD 312 /* k = 5 */
-#define SQR_FFT_TABLE3 \
- { { 312, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
- { 27, 7}, { 17, 6}, { 35, 7}, { 21, 8}, \
- { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
- { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
- { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
- { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
- { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \
+ { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \
+ { 127, 9}, { 71, 8}, { 143, 7}, { 287, 9}, \
{ 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
- { 127, 8}, { 255,10}, { 79, 9}, { 159, 8}, \
- { 319,10}, { 95, 9}, { 191,11}, { 63,10}, \
- { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
+ { 127, 8}, { 255, 7}, { 511, 9}, { 143, 8}, \
+ { 287,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
+ { 175, 8}, { 351,10}, { 95, 9}, { 191, 8}, \
+ { 383, 9}, { 207,10}, { 111,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
+ { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \
+ { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \
+ { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \
+ { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \
+ { 447,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD 1728
+
+#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \
+ { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \
+ { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \
+ { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \
+ { 11, 7}, { 23, 9}, { 7, 8}, { 23, 9}, \
+ { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 63, 8}, { 127, 7}, \
+ { 255, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \
+ { 575, 9}, { 79,10}, { 47,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255, 9}, { 143, 8}, \
+ { 287, 7}, { 575,10}, { 79, 9}, { 159, 8}, \
+ { 319, 9}, { 175, 8}, { 351,10}, { 95, 9}, \
+ { 191, 8}, { 383, 9}, { 207,10}, { 111, 9}, \
+ { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \
{ 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \
- { 319,11}, { 95,10}, { 191, 9}, { 383,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \
- { 159,10}, { 319, 9}, { 639,10}, { 351,11}, \
- { 191,10}, { 383, 9}, { 767,10}, { 415,11}, \
- { 223,10}, { 447,12}, { 4096,13}, { 8192,14}, \
- { 16384,15}, { 32768,16} }
-#define SQR_FFT_TABLE3_SIZE 78
-#define SQR_FFT_THRESHOLD 2752
-
-#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 151
-#define MULLO_MUL_N_THRESHOLD 1175
-
-#define DC_DIV_QR_THRESHOLD 133
-#define DC_DIVAPPR_Q_THRESHOLD 442
-#define DC_BDIV_QR_THRESHOLD 130
-#define DC_BDIV_Q_THRESHOLD 324
-
-#define INV_MULMOD_BNM1_THRESHOLD 116
-#define INV_NEWTON_THRESHOLD 507
-#define INV_APPR_THRESHOLD 454
-
-#define BINV_NEWTON_THRESHOLD 507
-#define REDC_1_TO_REDC_N_THRESHOLD 118
-
-#define MU_DIV_QR_THRESHOLD 1652
-#define MU_DIVAPPR_Q_THRESHOLD 1752
-#define MUPI_DIV_QR_THRESHOLD 225
-#define MU_BDIV_QR_THRESHOLD 762
-#define MU_BDIV_Q_THRESHOLD 1017
-
-#define MATRIX22_STRASSEN_THRESHOLD 28
-#define HGCD_THRESHOLD 76
-#define GCD_DC_THRESHOLD 333
-#define GCDEXT_DC_THRESHOLD 245
+ { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \
+ { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 351, 9}, { 703, 8}, { 1407,11}, { 191,10}, \
+ { 415,11}, { 223,10}, { 447, 9}, { 895,12}, \
+ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 92
+#define SQR_FFT_THRESHOLD 1600
+
+#define MULLO_BASECASE_THRESHOLD 2
+#define MULLO_DC_THRESHOLD 57
+#define MULLO_MUL_N_THRESHOLD 3176
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 187
+#define DC_BDIV_QR_THRESHOLD 64
+#define DC_BDIV_Q_THRESHOLD 146
+
+#define INV_MULMOD_BNM1_THRESHOLD 68
+#define INV_NEWTON_THRESHOLD 182
+#define INV_APPR_THRESHOLD 182
+
+#define BINV_NEWTON_THRESHOLD 186
+#define REDC_1_TO_REDC_N_THRESHOLD 60
+
+#define MU_DIV_QR_THRESHOLD 924
+#define MU_DIVAPPR_Q_THRESHOLD 807
+#define MUPI_DIV_QR_THRESHOLD 73
+#define MU_BDIV_QR_THRESHOLD 667
+#define MU_BDIV_Q_THRESHOLD 823
+
+#define MATRIX22_STRASSEN_THRESHOLD 8
+#define HGCD_THRESHOLD 61
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 974
+#define GCD_DC_THRESHOLD 195
+#define GCDEXT_DC_THRESHOLD 134
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 10
-#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 199
-#define SET_STR_PRECOMPUTE_THRESHOLD 478
+#define GET_STR_DC_THRESHOLD 9
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 190
+#define SET_STR_PRECOMPUTE_THRESHOLD 411
diff --git a/mpn/powerpc32/p7/gmp-mparam.h b/mpn/powerpc32/p7/gmp-mparam.h
new file mode 100644
index 000000000..bd18d4042
--- /dev/null
+++ b/mpn/powerpc32/p7/gmp-mparam.h
@@ -0,0 +1,149 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2004, 2008, 2009,
+2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* 3550 MHz POWER7/T4 */
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 1
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15
+#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD 34
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 89
+#define MUL_TOOM44_THRESHOLD 130
+#define MUL_TOOM6H_THRESHOLD 286
+#define MUL_TOOM8H_THRESHOLD 363
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_TOOM2_THRESHOLD 50
+#define SQR_TOOM3_THRESHOLD 89
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 222
+#define SQR_TOOM8_THRESHOLD 381
+
+#define MULMID_TOOM42_THRESHOLD 40
+
+#define MULMOD_BNM1_THRESHOLD 18
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 4,35,225,780,2212
+
+#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 14, 5}, { 29, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \
+ { 1087,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671, 8}, { 1343,10}, { 351,11}, \
+ { 191,10}, { 415, 9}, { 831,10}, { 431,11}, \
+ { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 77
+#define MUL_FFT_THRESHOLD 5312
+
+#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 344, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
+ { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543, 8}, { 1087,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 415, 9}, \
+ { 831,11}, { 223,10}, { 447,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 79
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 2
+#define MULLO_DC_THRESHOLD 34
+#define MULLO_MUL_N_THRESHOLD 10323
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 202
+#define DC_BDIV_QR_THRESHOLD 68
+#define DC_BDIV_Q_THRESHOLD 152
+
+#define INV_MULMOD_BNM1_THRESHOLD 66
+#define INV_NEWTON_THRESHOLD 226
+#define INV_APPR_THRESHOLD 189
+
+#define BINV_NEWTON_THRESHOLD 292
+#define REDC_1_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1442
+#define MU_DIVAPPR_Q_THRESHOLD 1442
+#define MUPI_DIV_QR_THRESHOLD 91
+#define MU_BDIV_QR_THRESHOLD 1308
+#define MU_BDIV_Q_THRESHOLD 1442
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 126
+#define HGCD_APPR_THRESHOLD 139
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 573
+#define GCDEXT_DC_THRESHOLD 448
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 9
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 834
+#define SET_STR_PRECOMPUTE_THRESHOLD 1888
diff --git a/mpn/powerpc32/tabselect.asm b/mpn/powerpc32/tabselect.asm
new file mode 100644
index 000000000..155a7b495
--- /dev/null
+++ b/mpn/powerpc32/tabselect.asm
@@ -0,0 +1,98 @@
+dnl PowerPC-32 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 603e: ?
+C 604e: ?
+C 75x (G3): ?
+C 7400,7410 (G4): ?
+C 744x,745x (G4+): ?
+C power4/ppc970: 3.3
+C power5: ?
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+C
+C Visits all nents table entries (n limbs each, read consecutively from tp)
+C and rewrites the n limbs at rp once per entry as
+C   rp[i] = (tp[i] AND mask) OR (rp[i] AND NOT mask)
+C where mask is all ones for entry number which (counting from 0) and zero
+C for every other entry.  Every entry is loaded, whatever the value of which.
+C NOTE(review): both loops are bottom-tested, so n >= 1 and nents >= 1 are
+C presumably required; confirm against the callers.
+define(`rp', `r3')
+define(`tp', `r4')
+define(`n', `r5')
+define(`nents', `r6')
+define(`which', `r7')
+
+define(`mask', `r8')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+ addi r0, n, 1 C r0 = n + 1, halved on the next line
+ srwi r0, r0, 1 C inner loop count
+ andi. r9, n, 1 C set cr0 for use in inner loop
+ subf which, nents, which C bias which by -nents; nents counts down below
+ slwi n, n, 2 C n = entry size in bytes, used to rewind rp
+
+L(outer):
+ mtctr r0 C put inner loop count in ctr
+
+ add r9, which, nents C are we at the selected table entry?
+ addic r9, r9, -1 C set CF iff not selected entry
+ subfe mask, r0, r0 C mask = CF - 1: all ones iff selected, else 0
+
+ beq cr0, L(top) C branch to loop entry if n even
+
+ lwz r9, 0(tp) C n odd: handle one limb ahead of the 2-way loop
+ addi tp, tp, 4
+ and r9, r9, mask C keep the table limb only if selected
+ lwz r11, 0(rp)
+ andc r11, r11, mask C keep the old rp limb only if not selected
+ or r9, r9, r11
+ stw r9, 0(rp)
+ addi rp, rp, 4
+ bdz L(end) C that was the only limb (n = 1)
+
+ ALIGN(16)
+L(top): lwz r9, 0(tp) C two limbs per iteration
+ lwz r10, 4(tp)
+ addi tp, tp, 8
+ nop C NOTE(review): presumably scheduling padding; confirm
+ and r9, r9, mask C keep the table limbs only if selected
+ and r10, r10, mask
+ lwz r11, 0(rp)
+ lwz r12, 4(rp)
+ andc r11, r11, mask C keep the old rp limbs only if not selected
+ andc r12, r12, mask
+ or r9, r9, r11
+ or r10, r10, r12
+ stw r9, 0(rp)
+ stw r10, 4(rp)
+ addi rp, rp, 8
+ bdnz L(top)
+
+L(end): subf rp, n, rp C move rp back to beginning
+ cmpwi cr6, nents, 1 C last entry?  cr6 is used so cr0 (n parity) survives
+ addi nents, nents, -1 C one entry fewer to go; tp already points at the next one
+ bne cr6, L(outer)
+
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/com.asm b/mpn/powerpc64/com.asm
index 4fb2e65d7..cb89bade2 100644
--- a/mpn/powerpc64/com.asm
+++ b/mpn/powerpc64/com.asm
@@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1?
-C POWER4/PPC970: 1.6
+C cycles/limb
+C POWER3/PPC630 1?
+C POWER4/PPC970 1.6
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.45
C TODO
C * 8-way unrolling brings timing down to about 1.3 cycles/limb.
diff --git a/mpn/powerpc64/copyd.asm b/mpn/powerpc64/copyd.asm
index 6a46a433c..256e7dc12 100644
--- a/mpn/powerpc64/copyd.asm
+++ b/mpn/powerpc64/copyd.asm
@@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1
-C POWER4/PPC970: 1
+C cycles/limb
+C POWER3/PPC630 1
+C POWER4/PPC970 1
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.4
C INPUT PARAMETERS
C rp r3
diff --git a/mpn/powerpc64/copyi.asm b/mpn/powerpc64/copyi.asm
index 5cb7e4856..31d1fc2e7 100644
--- a/mpn/powerpc64/copyi.asm
+++ b/mpn/powerpc64/copyi.asm
@@ -19,9 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1
-C POWER4/PPC970: 1
+C cycles/limb
+C POWER3/PPC630 1
+C POWER4/PPC970 1
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.4
C INPUT PARAMETERS
C rp r3
diff --git a/mpn/powerpc64/logops_n.asm b/mpn/powerpc64/logops_n.asm
index 917b59f45..2caa2c7c6 100644
--- a/mpn/powerpc64/logops_n.asm
+++ b/mpn/powerpc64/logops_n.asm
@@ -20,9 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 1.75
-C POWER4/PPC970: 2.10
+C cycles/limb
+C POWER3/PPC630 1.75
+C POWER4/PPC970 2.10
+C POWER5 ?
+C POWER6 ?
+C POWER7 1.75
C n POWER3/PPC630 POWER4/PPC970
C 1 15.00 15.33
diff --git a/mpn/powerpc64/lshift.asm b/mpn/powerpc64/lshift.asm
index f97661ae7..eb70c4983 100644
--- a/mpn/powerpc64/lshift.asm
+++ b/mpn/powerpc64/lshift.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.75
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.75
+C POWER7 2.15
C TODO
C * Try to reduce the number of needed live registers
diff --git a/mpn/powerpc64/mode64/lshiftc.asm b/mpn/powerpc64/lshiftc.asm
index 647244d1f..8f470a5f4 100644
--- a/mpn/powerpc64/mode64/lshiftc.asm
+++ b/mpn/powerpc64/lshiftc.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.5
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.5
+C POWER7 2.15
C TODO
C * Try to reduce the number of needed live registers
@@ -189,6 +190,9 @@ L(cj2): std r10, -32(rp)
L(ret): ld r31, -8(r1)
ld r30, -16(r1)
- mr r3, retval
+ifdef(`HAVE_ABI_mode32',
+` srdi r3, retval, 32
+ mr r4, retval
+',` mr r3, retval')
blr
EPILOGUE()
diff --git a/mpn/powerpc64/mode64/aors_n.asm b/mpn/powerpc64/mode64/aors_n.asm
index 980525f67..8c30871c2 100644
--- a/mpn/powerpc64/mode64/aors_n.asm
+++ b/mpn/powerpc64/mode64/aors_n.asm
@@ -1,6 +1,6 @@
dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -20,11 +20,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1.5
-C POWER4/PPC970 2
-C POWER5 2.25
-C POWER6 2.63
+C cycles/limb
+C POWER3/PPC630 1.5
+C POWER4/PPC970 2
+C POWER5 2
+C POWER6 2.63
+C POWER7 2.25-2.87
C This code is a little bit slower for POWER3/PPC630 than the simple code used
C previously, but it is much faster for POWER4/PPC970. The reason for the
@@ -136,6 +137,7 @@ L(go): ld r6, 0(r4) C load s1 limb
addi r4, r4, 32
addi r5, r5, 32
+ ALIGN(16)
L(top): ADDSUBC r28, r7, r6
ld r6, 0(r4) C load s1 limb
ld r7, 0(r5) C load s2 limb
diff --git a/mpn/powerpc64/mode64/aorscnd_n.asm b/mpn/powerpc64/mode64/aorscnd_n.asm
new file mode 100644
index 000000000..47aa6fb39
--- /dev/null
+++ b/mpn/powerpc64/mode64/aorscnd_n.asm
@@ -0,0 +1,185 @@
+dnl PowerPC-64 mpn_addcnd_n/mpn_subcnd_n.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 2.25
+C POWER5 ?
+C POWER6 ?
+C POWER7 ?
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+define(`cnd', `r7')
+
+C Computes, over n limbs,
+C   rp[] = up[] + (vp[] AND mask)   for mpn_addcnd_n
+C   rp[] = up[] - (vp[] AND mask)   for mpn_subcnd_n
+C where mask is all ones if cnd is nonzero and zero otherwise.  The same
+C instruction sequence runs for either value of cnd.  Returns the carry-out
+C (addcnd) or borrow-out (subcnd) as 0 or 1 in r3.
+C
+C Per-operation macros defined below:
+C   ADDSUB   operation for the first limb; ignores the incoming carry bit
+C   ADDSUBC  operation for later limbs; consumes and produces the carry bit
+C   GENRVAL  turns the CA - 1 value left in r3 into the 0/1 return value
+C   CLRCB    puts the carry bit into its no-carry / no-borrow state
+C   SETCBR   not referenced in the code below
+ifdef(`OPERATION_addcnd_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_addcnd_n)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_subcnd_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_subcnd_n)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n)
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1) C save r27-r31 below the stack pointer; r1 is not moved
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ subfic cnd, cnd, 0 C cnd = -cnd, carry set iff cnd was 0
+ subfe cnd, cnd, cnd C cnd = carry - 1: 0 if cnd was 0, else all ones
+
+ rldicl. r0, r6, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi r6, r6, 3 C compute count...
+ srdi r6, r6, 2 C ...for ctr
+ mtctr r6 C copy count into ctr
+ beq cr0, L(b00) C n mod 4 = 0
+ blt cr6, L(b01) C n mod 4 = 1
+ beq cr6, L(b10) C n mod 4 = 2; fall through for n mod 4 = 3
+
+L(b11): ld r8, 0(up) C load s1 limb
+ ld r9, 0(vp) C load s2 limb
+ ld r10, 8(up) C load s1 limb
+ ld r11, 8(vp) C load s2 limb
+ ld r12, 16(up) C load s1 limb
+ addi up, up, 24
+ ld r0, 16(vp) C load s2 limb
+ addi vp, vp, 24
+ and r9, r9, cnd C mask the s2 limbs
+ and r11, r11, cnd
+ and r0, r0, cnd
+ ADDSUB r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r29, 0(rp)
+ std r30, 8(rp)
+ std r31, 16(rp)
+ addi rp, rp, 24
+ bdnz L(go) C more groups of 4 limbs remain
+ b L(ret)
+
+L(b01): ld r12, 0(up) C load s1 limb
+ addi up, up, 8
+ ld r0, 0(vp) C load s2 limb
+ addi vp, vp, 8
+ and r0, r0, cnd C mask the s2 limb
+ ADDSUB r31, r0, r12 C add or subtract
+ std r31, 0(rp)
+ addi rp, rp, 8
+ bdnz L(go) C more groups of 4 limbs remain
+ b L(ret)
+
+L(b10): ld r10, 0(up) C load s1 limb
+ ld r11, 0(vp) C load s2 limb
+ ld r12, 8(up) C load s1 limb
+ addi up, up, 16
+ ld r0, 8(vp) C load s2 limb
+ addi vp, vp, 16
+ and r11, r11, cnd C mask the s2 limbs
+ and r0, r0, cnd
+ ADDSUB r30, r11, r10 C add or subtract
+ ADDSUBC r31, r0, r12 C add or subtract with carry
+ std r30, 0(rp)
+ std r31, 8(rp)
+ addi rp, rp, 16
+ bdnz L(go) C more groups of 4 limbs remain
+ b L(ret)
+
+L(b00): CLRCB C clear/set cy
+L(go): ld r6, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ and r27, r27, cnd C mask the s2 limbs
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdz L(end) C last group: skip the pipelined loop
+
+ addi up, up, 32
+ addi vp, vp, 32
+
+L(top): ADDSUBC r28, r27, r6 C operate on the current group while...
+ ld r6, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ADDSUBC r29, r9, r8
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ADDSUBC r30, r11, r10
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ADDSUBC r31, r0, r12
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ std r28, 0(rp)
+ addi up, up, 32
+ std r29, 8(rp)
+ addi vp, vp, 32
+ std r30, 16(rp)
+ std r31, 24(rp)
+ addi rp, rp, 32
+ and r27, r27, cnd C mask the s2 limbs just loaded for the next group
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): ADDSUBC r28, r27, r6 C finish the group already loaded and masked
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r28, 0(rp)
+ std r29, 8(rp)
+ std r30, 16(rp)
+ std r31, 24(rp)
+
+L(ret): ld r31, -8(r1) C restore r27-r31; loads leave the carry bit alone
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+
+ subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/aorslshC_n.asm b/mpn/powerpc64/mode64/aorslshC_n.asm
index 4622cd946..3776d3e59 100644
--- a/mpn/powerpc64/mode64/aorslshC_n.asm
+++ b/mpn/powerpc64/mode64/aorslshC_n.asm
@@ -17,11 +17,12 @@ dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-C cycles/limb
-C POWER3/PPC630 1.83 (1.5 c/l should be possible)
-C POWER4/PPC970 3 (2.0 c/l should be possible)
-C POWER5 3
-C POWER6 3.5-47
+C cycles/limb
+C POWER3/PPC630 1.83 (1.5 c/l should be possible)
+C POWER4/PPC970 3 (2.0 c/l should be possible)
+C POWER5 3
+C POWER6 3.5-47
+C POWER7 3
C STATUS
C * Try combining upx+up, and vpx+vp.
diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm
index b1a3315b6..4b843a044 100644
--- a/mpn/powerpc64/mode64/aorsmul_1.asm
+++ b/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C mpn_addmul_1 mpn_submul_1
-C cycles/limb cycles/limb
-C POWER3/PPC630 6-18 6-18
-C POWER4/PPC970 8 8.3
-C POWER5 8 8.25
-C POWER6 16.25 16.75
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 6-18 6-18
+C POWER4/PPC970 8 8.3
+C POWER5 8 8.25
+C POWER6 16.25 16.75
+C POWER7 3.77 4.9
C TODO
C * Try to reduce the number of needed live registers
@@ -53,7 +54,7 @@ ifdef(`OPERATION_submul_1',`
')
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
+
ASM_START()
PROLOGUE(func_nc)
EPILOGUE()
diff --git a/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/mpn/powerpc64/mode64/bdiv_dbm1c.asm
index 40f3d4ec7..e88fc4440 100644
--- a/mpn/powerpc64/mode64/bdiv_dbm1c.asm
+++ b/mpn/powerpc64/mode64/bdiv_dbm1c.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
+C cycles/limb
C POWER3/PPC630 6-18
C POWER4/PPC970 8.5?
C POWER5 8.5 fluctuating as function of n % 3
C POWER6 15
+C POWER7 4.75
C TODO
C * Nothing to do...
diff --git a/mpn/powerpc64/mode64/dive_1.asm b/mpn/powerpc64/mode64/dive_1.asm
index d457d65e9..0f94154bf 100644
--- a/mpn/powerpc64/mode64/dive_1.asm
+++ b/mpn/powerpc64/mode64/dive_1.asm
@@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm unorm
+C cycles/limb
+C norm unorm
C POWER3/PPC630 13-19
-C POWER4/PPC970 16
-C POWER5 16 16
-C POWER6 37 46
+C POWER4/PPC970 16
+C POWER5 16 16
+C POWER6 37 46
+C POWER7 12 12
C TODO
C * Check if n=1 code is really an improvement. It probably isn't.
diff --git a/mpn/powerpc64/mode64/divrem_1.asm b/mpn/powerpc64/mode64/divrem_1.asm
index 9d065b728..c0e7b2a9f 100644
--- a/mpn/powerpc64/mode64/divrem_1.asm
+++ b/mpn/powerpc64/mode64/divrem_1.asm
@@ -20,12 +20,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm unorm frac
-C POWER3/PPC630 16-34 16-34 ~11
-C POWER4/PPC970 29 19
-C POWER5 29 29 ~20
-C POWER6 50 59 ~42
+C cycles/limb
+C norm unorm frac
+C POWER3/PPC630 16-34 16-34 ~11
+C POWER4/PPC970 29 19
+C POWER5 29 29 ~20
+C POWER6 50 59 ~42
+C POWER7 25 25 ~14
C INPUT PARAMETERS
C qp = r3
diff --git a/mpn/powerpc64/mode64/divrem_2.asm b/mpn/powerpc64/mode64/divrem_2.asm
index 53ef1c708..18f549357 100644
--- a/mpn/powerpc64/mode64/divrem_2.asm
+++ b/mpn/powerpc64/mode64/divrem_2.asm
@@ -19,12 +19,13 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C norm frac
+C cycles/limb
+C norm frac
C POWER3/PPC630
-C POWER4/PPC970 ? ?
-C POWER5 37 ?
-C POWER6 62 ?
+C POWER4/PPC970 ? ?
+C POWER5 37 ?
+C POWER6 62 ?
+C POWER7 30.5 ?
C INPUT PARAMETERS
C qp = r3
diff --git a/mpn/powerpc64/mode64/invert_limb.asm b/mpn/powerpc64/mode64/invert_limb.asm
index aed0a32ab..31b243001 100644
--- a/mpn/powerpc64/mode64/invert_limb.asm
+++ b/mpn/powerpc64/mode64/invert_limb.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb (approximate)
-C POWER3/PPC630 80
-C POWER4/PPC970 86
-C POWER5 86
-C POWER6 170
+C cycles/limb (approximate)
+C POWER3/PPC630 80
+C POWER4/PPC970 86
+C POWER5 86
+C POWER6 170
+C POWER7 66
ASM_START()
PROLOGUE(mpn_invert_limb)
diff --git a/mpn/powerpc64/mode64/mod_1_1.asm b/mpn/powerpc64/mode64/mod_1_1.asm
index 61e39310a..f24ceb2c8 100644
--- a/mpn/powerpc64/mode64/mod_1_1.asm
+++ b/mpn/powerpc64/mode64/mod_1_1.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 17
-C POWER5 16
-C POWER6 30
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 17
+C POWER5 16
+C POWER6 30
+C POWER7 10.2
C TODO
C * Optimise, in particular the cps function. This was compiler-generated and
diff --git a/mpn/powerpc64/mode64/mod_1_4.asm b/mpn/powerpc64/mode64/mod_1_4.asm
index e0f26da96..b6163c5e7 100644
--- a/mpn/powerpc64/mode64/mod_1_4.asm
+++ b/mpn/powerpc64/mode64/mod_1_4.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 9
-C POWER5 9
-C POWER6 13
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 9
+C POWER5 9
+C POWER6 13
+C POWER7 3.5
C TODO
C * Optimise, in particular the cps function. This was compiler-generated and
diff --git a/mpn/powerpc64/mode64/mod_34lsub1.asm b/mpn/powerpc64/mode64/mod_34lsub1.asm
index 62ba17a3c..30b9f98be 100644
--- a/mpn/powerpc64/mode64/mod_34lsub1.asm
+++ b/mpn/powerpc64/mode64/mod_34lsub1.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 1.33
-C POWER4/PPC970 1.5
-C POWER5 1.32
-C POWER6 2.35
+C cycles/limb
+C POWER3/PPC630 1.33
+C POWER4/PPC970 1.5
+C POWER5 1.32
+C POWER6 2.35
+C POWER7 1
C INPUT PARAMETERS
define(`up',`r3')
diff --git a/mpn/powerpc64/mode64/mode1o.asm b/mpn/powerpc64/mode64/mode1o.asm
index 489ca8551..37e4028d8 100644
--- a/mpn/powerpc64/mode64/mode1o.asm
+++ b/mpn/powerpc64/mode64/mode1o.asm
@@ -19,10 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 13-19
-C POWER4/PPC970: 16
-C POWER5: 16
+C cycles/limb
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16
+C POWER6 ?
+C POWER7 12
C TODO
C * Check if n=1 code is really an improvement. It probably isn't.
diff --git a/mpn/powerpc64/mode64/mul_1.asm b/mpn/powerpc64/mode64/mul_1.asm
index 12bff2fb6..e911cf551 100644
--- a/mpn/powerpc64/mode64/mul_1.asm
+++ b/mpn/powerpc64/mode64/mul_1.asm
@@ -21,11 +21,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 7.25? not updated for last file revision
-C POWER5 7.25
-C POWER6 14
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 7.25? not updated for last file revision
+C POWER5 7.25
+C POWER6 14
+C POWER7 2.9
C TODO
C * Try to reduce the number of needed live registers (at least r5 and r10
diff --git a/mpn/powerpc64/mode64/mul_basecase.asm b/mpn/powerpc64/mode64/mul_basecase.asm
index fd7ff9aa1..9a3957f94 100644
--- a/mpn/powerpc64/mode64/mul_basecase.asm
+++ b/mpn/powerpc64/mode64/mul_basecase.asm
@@ -1,4 +1,4 @@
-dnl PowerPC-64 mpn_basecase.
+dnl PowerPC-64 mpn_mul_basecase.
dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software
dnl Foundation, Inc.
@@ -20,11 +20,11 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 6-18
-C POWER4/PPC970 8
-C POWER5 8
-C POWER6 24
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 24
C INPUT PARAMETERS
define(`rp', `r3')
diff --git a/mpn/powerpc64/mode64/p3/gmp-mparam.h b/mpn/powerpc64/mode64/p3/gmp-mparam.h
index 221b0e1d8..cf1d8ca47 100644
--- a/mpn/powerpc64/mode64/p3/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p3/gmp-mparam.h
@@ -23,12 +23,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
@@ -36,22 +37,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM33_THRESHOLD 33
#define MUL_TOOM44_THRESHOLD 46
#define MUL_TOOM6H_THRESHOLD 77
-#define MUL_TOOM8H_THRESHOLD 115
+#define MUL_TOOM8H_THRESHOLD 139
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 38
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 33
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 32
-
-#define SQR_BASECASE_THRESHOLD 0 /* always */
-#define SQR_TOOM2_THRESHOLD 16
-#define SQR_TOOM3_THRESHOLD 49
-#define SQR_TOOM4_THRESHOLD 70
-#define SQR_TOOM6_THRESHOLD 93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 48
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 14
+#define SQR_TOOM3_THRESHOLD 45
+#define SQR_TOOM4_THRESHOLD 64
+#define SQR_TOOM6_THRESHOLD 85
#define SQR_TOOM8_THRESHOLD 139
+#define MULMID_TOOM42_THRESHOLD 22
+
#define MULMOD_BNM1_THRESHOLD 8
-#define SQRMOD_BNM1_THRESHOLD 9
+#define SQRMOD_BNM1_THRESHOLD 10
+
+#define POWM_SEC_TABLE 2,23,127,502,1421
#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -123,35 +128,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 118
#define SQR_FFT_THRESHOLD 1728
-#define MULLO_BASECASE_THRESHOLD 3
-#define MULLO_DC_THRESHOLD 28
-#define MULLO_MUL_N_THRESHOLD 4940
+#define MULLO_BASECASE_THRESHOLD 2
+#define MULLO_DC_THRESHOLD 27
+#define MULLO_MUL_N_THRESHOLD 2367
-#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 95
-#define DC_BDIV_QR_THRESHOLD 28
+#define DC_DIV_QR_THRESHOLD 26
+#define DC_DIVAPPR_Q_THRESHOLD 87
+#define DC_BDIV_QR_THRESHOLD 27
#define DC_BDIV_Q_THRESHOLD 62
-#define INV_MULMOD_BNM1_THRESHOLD 29
-#define INV_NEWTON_THRESHOLD 92
-#define INV_APPR_THRESHOLD 94
+#define INV_MULMOD_BNM1_THRESHOLD 34
+#define INV_NEWTON_THRESHOLD 91
+#define INV_APPR_THRESHOLD 91
#define BINV_NEWTON_THRESHOLD 115
-#define REDC_1_TO_REDC_N_THRESHOLD 30
+#define REDC_1_TO_REDC_N_THRESHOLD 31
#define MU_DIV_QR_THRESHOLD 551
#define MU_DIVAPPR_Q_THRESHOLD 551
-#define MUPI_DIV_QR_THRESHOLD 49
-#define MU_BDIV_QR_THRESHOLD 492
+#define MUPI_DIV_QR_THRESHOLD 50
+#define MU_BDIV_QR_THRESHOLD 474
#define MU_BDIV_Q_THRESHOLD 492
-#define MATRIX22_STRASSEN_THRESHOLD 9
-#define HGCD_THRESHOLD 55
-#define GCD_DC_THRESHOLD 150
-#define GCDEXT_DC_THRESHOLD 124
+#define MATRIX22_STRASSEN_THRESHOLD 8
+#define HGCD_THRESHOLD 53
+#define HGCD_APPR_THRESHOLD 55
+#define HGCD_REDUCE_THRESHOLD 688
+#define GCD_DC_THRESHOLD 148
+#define GCDEXT_DC_THRESHOLD 118
#define JACOBI_BASE_METHOD 1
-#define GET_STR_DC_THRESHOLD 17
+#define GET_STR_DC_THRESHOLD 16
#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 354
+#define SET_STR_DC_THRESHOLD 375
#define SET_STR_PRECOMPUTE_THRESHOLD 812
diff --git a/mpn/powerpc64/mode64/p4/gmp-mparam.h b/mpn/powerpc64/mode64/p4/gmp-mparam.h
index 9a0932654..317bc94d6 100644
--- a/mpn/powerpc64/mode64/p4/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p4/gmp-mparam.h
@@ -29,6 +29,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 37
@@ -43,16 +44,20 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 62
-#define SQR_BASECASE_THRESHOLD 5
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 57
-#define SQR_TOOM4_THRESHOLD 136
-#define SQR_TOOM6_THRESHOLD 181
-#define SQR_TOOM8_THRESHOLD 272
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 214
+#define SQR_TOOM6_THRESHOLD 254
+#define SQR_TOOM8_THRESHOLD 430
-#define MULMOD_BNM1_THRESHOLD 13
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 12
#define SQRMOD_BNM1_THRESHOLD 16
+#define POWM_SEC_TABLE 6,47,347,1036,2826
+
#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 372, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \
@@ -116,9 +121,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 103
#define SQR_FFT_THRESHOLD 2752
-#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_BASECASE_THRESHOLD 3
#define MULLO_DC_THRESHOLD 36
-#define MULLO_MUL_N_THRESHOLD 12691
+#define MULLO_MUL_N_THRESHOLD 13463
#define DC_DIV_QR_THRESHOLD 43
#define DC_DIVAPPR_Q_THRESHOLD 158
@@ -139,12 +144,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 998
#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 105
+#define HGCD_THRESHOLD 103
+#define HGCD_APPR_THRESHOLD 110
+#define HGCD_REDUCE_THRESHOLD 1962
#define GCD_DC_THRESHOLD 318
#define GCDEXT_DC_THRESHOLD 242
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 12
#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 858
-#define SET_STR_PRECOMPUTE_THRESHOLD 1864
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1781
diff --git a/mpn/powerpc64/mode64/p5/gmp-mparam.h b/mpn/powerpc64/mode64/p5/gmp-mparam.h
index 827b555c8..9220f99d5 100644
--- a/mpn/powerpc64/mode64/p5/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p5/gmp-mparam.h
@@ -1,4 +1,4 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
+/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file.
Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free
Software Foundation, Inc.
@@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 40
@@ -38,22 +39,26 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM33_THRESHOLD 50
#define MUL_TOOM44_THRESHOLD 121
#define MUL_TOOM6H_THRESHOLD 202
-#define MUL_TOOM8H_THRESHOLD 303
+#define MUL_TOOM8H_THRESHOLD 260
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 82
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
-#define SQR_BASECASE_THRESHOLD 9
-#define SQR_TOOM2_THRESHOLD 36
-#define SQR_TOOM3_THRESHOLD 59
-#define SQR_TOOM4_THRESHOLD 147
-#define SQR_TOOM6_THRESHOLD 204
-#define SQR_TOOM8_THRESHOLD 288
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 142
+#define SQR_TOOM6_THRESHOLD 191
+#define SQR_TOOM8_THRESHOLD 284
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 16
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 4,35,387,1068,2699
#define MUL_FFT_MODF_THRESHOLD 348 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -166,15 +171,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 2752
#define MULLO_BASECASE_THRESHOLD 0
-#define MULLO_DC_THRESHOLD 31
+#define MULLO_DC_THRESHOLD 42
#define MULLO_MUL_N_THRESHOLD 6633
-#define DC_DIV_QR_THRESHOLD 37
+#define DC_DIV_QR_THRESHOLD 43
#define DC_DIVAPPR_Q_THRESHOLD 155
#define DC_BDIV_QR_THRESHOLD 46
-#define DC_BDIV_Q_THRESHOLD 112
+#define DC_BDIV_Q_THRESHOLD 120
-#define INV_MULMOD_BNM1_THRESHOLD 26
+#define INV_MULMOD_BNM1_THRESHOLD 52
#define INV_NEWTON_THRESHOLD 177
#define INV_APPR_THRESHOLD 165
@@ -189,11 +194,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MATRIX22_STRASSEN_THRESHOLD 15
#define HGCD_THRESHOLD 108
-#define GCD_DC_THRESHOLD 303
+#define HGCD_APPR_THRESHOLD 113
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 315
#define GCDEXT_DC_THRESHOLD 237
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 13
#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 532
-#define SET_STR_PRECOMPUTE_THRESHOLD 1639
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1585
diff --git a/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/mpn/powerpc64/mode64/p6/aorsmul_1.asm
new file mode 100644
index 000000000..4bd508488
--- /dev/null
+++ b/mpn/powerpc64/mode64/p6/aorsmul_1.asm
@@ -0,0 +1,172 @@
+dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011
+dnl Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 ? ?
+C POWER4/PPC970 ? ?
+C POWER5 ? ?
+C POWER6 12.25 12.8
+C POWER7 ? ?
+
+C TODO
+C * Reduce register usage.
+C * Schedule function entry code.
+C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C would bring us to 9 c/l.
+C * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
+
+C INPUT PARAMETERS
+define(`rp', `r3') C limbs here are loaded, combined and stored back; r3 also returns the result
+define(`up', `r4') C source limbs, only loaded
+define(`n', `r5') C limb count; r5 is reused as scratch once the count is in ctr
+define(`v0', `r6') C multiplier applied to every up limb
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUBC, adde) C combine with the rp limb, carry in and out
+ define(ADDSUB, addc) C combine with the rp limb, carry out only
+ define(func, mpn_addmul_1)
+ define(func_nc, mpn_addmul_1c) C FIXME: not really supported
+ define(AM, `$1') C AM text is emitted in the addmul build
+ define(SM, `') C SM text is dropped in the addmul build
+ define(CLRRSC, `addic $1, r0, 0') C used at L(b0) where r0 is zero, so target = 0 and CA is cleared
+')
+ifdef(`OPERATION_submul_1',`
+ define(ADDSUBC, subfe) C third operand minus second, with CA as not-borrow in and out
+ define(ADDSUB, subfc) C third operand minus second, CA out only
+ define(func, mpn_submul_1)
+ define(func_nc, mpn_submul_1c) C FIXME: not really supported
+ define(AM, `') C AM text is dropped in the submul build
+ define(SM, `$1') C SM text is emitted in the submul build
+ define(CLRRSC, `subfc $1, r0, r0') C target = r0 - r0 = 0 and CA is set, meaning no borrow
+')
+
+ASM_START()
+PROLOGUE(func) C func is mpn_addmul_1 or mpn_submul_1, chosen by the OPERATION define
+ std r31, -8(r1) C save r27-r31 below r1 without moving r1; reloaded before blr
+ std r30, -16(r1) C NOTE(review): relies on the ABI permitting stores below r1 -- confirm
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2 C cr6 = compare of n & 3 against 2
+ addi n, n, 3 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy loop count into ctr
+ beq cr0, L(b0) C n & 3 = 0: enter the loop at its top
+ blt cr6, L(b1) C n & 3 = 1: one leading limb
+ beq cr6, L(b2) C n & 3 = 2: two leading limbs; fall through when n & 3 = 3
+
+L(b3): ld r8, 0(up) C three leading limbs of up
+ ld r7, 8(up)
+ ld r27, 16(up)
+ addi up, up, 16 C bias pointers so the loop offsets line up after entry at L(l3)
+ addi rp, rp, 16
+ mulld r5, r8, v0 C low product halves in r5, r9, r11; r5 no longer holds n
+ mulhdu r8, r8, v0 C high product halves replace the loaded limbs
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r29, -16(rp) C the three rp limbs to combine with
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r9, r9, r8 C fold each high half into the next low half
+ adde r11, r11, r7
+ addze r12, r27 C r12 = carry limb handed to the next group
+ ADDSUB r5, r5, r29 C first combine starts a fresh carry chain
+ b L(l3)
+
+L(b2): ld r7, 0(up) C two leading limbs of up
+ ld r27, 8(up)
+ addi up, up, 8 C bias pointers for entry at L(l2)
+ addi rp, rp, 8
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r11, r11, r7 C fold high half of limb 0 into low half of limb 1
+ addze r12, r27 C r12 = carry limb handed to the next group
+ ADDSUB r9, r9, r30
+ b L(l2)
+
+L(b1): ld r27, 0(up) C single leading limb; pointers need no bias
+ ld r31, 0(rp)
+ mulld r11, r27, v0
+ mulhdu r12, r27, v0 C high half goes straight into the carry limb r12
+ ADDSUB r11, r11, r31
+ b L(l1)
+
+L(b0): addi up, up, -8 C bias so 8(up) in the loop is the first limb
+ addi rp, rp, -8
+ CLRRSC( r12) C clear r12 and clr/set cy
+
+ ALIGN(32)
+L(top):
+SM(` subfe r11, r0, r0') C complement...
+SM(` addic r11, r11, 1') C ...carry flag
+ ld r10, 8(up) C four limbs of up per iteration
+ ld r8, 16(up)
+ ld r7, 24(up)
+ ld r27, 32(up)
+ addi up, up, 32
+ addi rp, rp, 32
+ mulld r0, r10, v0 C low halves in r0, r5, r9, r11
+ mulhdu r10, r10, v0 C high halves in r10, r8, r7, r27
+ mulld r5, r8, v0
+ mulhdu r8, r8, v0
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r28, -24(rp)
+ adde r0, r0, r12 C add carry limb of the previous group plus incoming CA
+ ld r29, -16(rp)
+ adde r5, r5, r10 C fold each high half into the next low half
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ adde r9, r9, r8
+ adde r11, r11, r7
+ addze r12, r27 C r12 = carry limb for the next group
+ ADDSUB r0, r0, r28 C combine with rp limbs and store; CA chains through all four
+ std r0, -24(rp)
+ ADDSUBC r5, r5, r29
+L(l3): std r5, -16(rp) C entry point when n & 3 = 3
+ ADDSUBC r9, r9, r30
+L(l2): std r9, -8(rp) C entry point when n & 3 = 2
+ ADDSUBC r11, r11, r31
+L(l1): std r11, 0(rp) C entry point when n & 3 = 1
+ bdnz L(top) C ctr was set to (n + 3) >> 2
+
+AM(` addze r3, r12') C return r12 + CA; r3 held rp until here
+SM(` subfe r11, r0, r0') C complement...
+ ld r31, -8(r1) C restore r27-r31 from the slots written at entry
+SM(` subf r3, r11, r12') C r3 = r12 - r11, with r11 = CA - 1 from the subfe above
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h
index d447b56d9..5392138f1 100644
--- a/mpn/powerpc64/mode64/p6/gmp-mparam.h
+++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -1,7 +1,7 @@
-/* gmp-mparam.h -- Compiler/machine parameter header file.
+/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free
-Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011
+Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5
#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 21
@@ -38,23 +39,27 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM33_THRESHOLD 50
#define MUL_TOOM44_THRESHOLD 112
#define MUL_TOOM6H_THRESHOLD 274
-#define MUL_TOOM8H_THRESHOLD 430
+#define MUL_TOOM8H_THRESHOLD 339
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 84
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76
#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78
-#define SQR_BASECASE_THRESHOLD 9
-#define SQR_TOOM2_THRESHOLD 30
-#define SQR_TOOM3_THRESHOLD 53
-#define SQR_TOOM4_THRESHOLD 148
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 49
+#define SQR_TOOM4_THRESHOLD 136
#define SQR_TOOM6_THRESHOLD 226
-#define SQR_TOOM8_THRESHOLD 430
+#define SQR_TOOM8_THRESHOLD 393
+
+#define MULMID_TOOM42_THRESHOLD 36
#define MULMOD_BNM1_THRESHOLD 14
#define SQRMOD_BNM1_THRESHOLD 14
+#define POWM_SEC_TABLE 4,23,213,840,2618
+
#define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
@@ -106,34 +111,36 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 2368
#define MULLO_BASECASE_THRESHOLD 5
-#define MULLO_DC_THRESHOLD 28
-#define MULLO_MUL_N_THRESHOLD 6633
-
-#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 112
-#define DC_BDIV_QR_THRESHOLD 29
-#define DC_BDIV_Q_THRESHOLD 86
-
-#define INV_MULMOD_BNM1_THRESHOLD 47
-#define INV_NEWTON_THRESHOLD 93
-#define INV_APPR_THRESHOLD 91
-
-#define BINV_NEWTON_THRESHOLD 132
-#define REDC_1_TO_REDC_N_THRESHOLD 39
-
-#define MU_DIV_QR_THRESHOLD 855
-#define MU_DIVAPPR_Q_THRESHOLD 807
-#define MUPI_DIV_QR_THRESHOLD 33
-#define MU_BDIV_QR_THRESHOLD 807
-#define MU_BDIV_Q_THRESHOLD 872
-
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 64
-#define GCD_DC_THRESHOLD 237
-#define GCDEXT_DC_THRESHOLD 183
+#define MULLO_DC_THRESHOLD 61
+#define MULLO_MUL_N_THRESHOLD 3271
+
+#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIVAPPR_Q_THRESHOLD 200
+#define DC_BDIV_QR_THRESHOLD 70
+#define DC_BDIV_Q_THRESHOLD 168
+
+#define INV_MULMOD_BNM1_THRESHOLD 61
+#define INV_NEWTON_THRESHOLD 166
+#define INV_APPR_THRESHOLD 166
+
+#define BINV_NEWTON_THRESHOLD 222
+#define REDC_1_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 59
+#define MU_BDIV_QR_THRESHOLD 889
+#define MU_BDIV_Q_THRESHOLD 1078
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 109
+#define HGCD_APPR_THRESHOLD 108
+#define HGCD_REDUCE_THRESHOLD 1052
+#define GCD_DC_THRESHOLD 501
+#define GCDEXT_DC_THRESHOLD 249
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 17
-#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 29
#define SET_STR_DC_THRESHOLD 532
-#define SET_STR_PRECOMPUTE_THRESHOLD 1648
+#define SET_STR_PRECOMPUTE_THRESHOLD 1639
diff --git a/mpn/powerpc64/mode64/p6/mul_basecase.asm b/mpn/powerpc64/mode64/p6/mul_basecase.asm
index 427d6081a..52c5af8ff 100644
--- a/mpn/powerpc64/mode64/p6/mul_basecase.asm
+++ b/mpn/powerpc64/mode64/p6/mul_basecase.asm
@@ -1,4 +1,4 @@
-dnl PowerPC-64 mpn_basecase.
+dnl PowerPC-64 mpn_mul_basecase.
dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010 Free
dnl Software Foundation, Inc.
diff --git a/mpn/powerpc64/mode64/p7/gmp-mparam.h b/mpn/powerpc64/mode64/p7/gmp-mparam.h
new file mode 100644
index 000000000..02603c525
--- /dev/null
+++ b/mpn/powerpc64/mode64/p7/gmp-mparam.h
@@ -0,0 +1,159 @@
+/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011
+Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define BYTES_PER_MP_LIMB 8
+
+/* 3550 MHz POWER7 (gcc110.fsffrance.org) */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 28
+
+#define MUL_TOOM22_THRESHOLD 22
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 202
+#define MUL_TOOM6H_THRESHOLD 298
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 143
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 135
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 141
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 36
+#define SQR_TOOM3_THRESHOLD 109
+#define SQR_TOOM4_THRESHOLD 202
+#define SQR_TOOM6_THRESHOLD 303
+#define SQR_TOOM8_THRESHOLD 399
+
+#define MULMID_TOOM42_THRESHOLD 62
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 16
+
+#define POWM_SEC_TABLE 6,65,342,1465
+
+#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 436, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 43,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 159,11}, { 95,10}, { 191,11}, { 111,12}, \
+ { 63,11}, { 127,10}, { 255,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \
+ { 95,11}, { 191,10}, { 383,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 575,11}, \
+ { 303,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 106
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639,11}, { 175,12}, { 95,11}, { 191,10}, \
+ { 383, 9}, { 767,11}, { 207,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 319,10}, { 639, 9}, { 1279,10}, \
+ { 671,11}, { 351,10}, { 703,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,12}, \
+ { 223,11}, { 447,10}, { 895,11}, { 479,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 103
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 23
+#define MULLO_MUL_N_THRESHOLD 9174
+
+#define DC_DIV_QR_THRESHOLD 30
+#define DC_DIVAPPR_Q_THRESHOLD 124
+#define DC_BDIV_QR_THRESHOLD 66
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 81
+#define INV_NEWTON_THRESHOLD 165
+#define INV_APPR_THRESHOLD 133
+
+#define BINV_NEWTON_THRESHOLD 300
+#define REDC_1_TO_REDC_N_THRESHOLD 76
+
+#define MU_DIV_QR_THRESHOLD 1470
+#define MU_DIVAPPR_Q_THRESHOLD 1442
+#define MUPI_DIV_QR_THRESHOLD 58
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1499
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 124
+#define HGCD_APPR_THRESHOLD 155
+#define HGCD_REDUCE_THRESHOLD 3134
+#define GCD_DC_THRESHOLD 492
+#define GCDEXT_DC_THRESHOLD 333
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 17
+#define SET_STR_DC_THRESHOLD 1517
+#define SET_STR_PRECOMPUTE_THRESHOLD 3421
diff --git a/mpn/powerpc64/mode64/rsh1add_n.asm b/mpn/powerpc64/mode64/rsh1add_n.asm
index 8af3ca774..2a5ef3060 100644
--- a/mpn/powerpc64/mode64/rsh1add_n.asm
+++ b/mpn/powerpc64/mode64/rsh1add_n.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 2 (1.5 c/l should be possible)
-C POWER4/PPC970 4 (2.0 c/l should be possible)
-C POWER5 3.5 (2.0 c/l should be possible)
-C POWER6 4.5
+C cycles/limb
+C POWER3/PPC630 2 (1.5 c/l should be possible)
+C POWER4/PPC970 4 (2.0 c/l should be possible)
+C POWER5 3.5 (2.0 c/l should be possible)
+C POWER6 4.5
+C POWER7 3.5
define(`rp',`r3')
define(`up',`r4')
diff --git a/mpn/powerpc64/mode64/rsh1sub_n.asm b/mpn/powerpc64/mode64/rsh1sub_n.asm
index 1faa03379..b10eb8ab7 100644
--- a/mpn/powerpc64/mode64/rsh1sub_n.asm
+++ b/mpn/powerpc64/mode64/rsh1sub_n.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 2 (1.5 c/l should be possible)
-C POWER4/PPC970 4 (2.0 c/l should be possible)
-C POWER5 3.5 (2.0 c/l should be possible)
-C POWER6 4.5
+C cycles/limb
+C POWER3/PPC630 2 (1.5 c/l should be possible)
+C POWER4/PPC970 4 (2.0 c/l should be possible)
+C POWER5 3.5 (2.0 c/l should be possible)
+C POWER6 4.5
+C POWER7 3.5
define(`rp',`r3')
define(`up',`r4')
diff --git a/mpn/powerpc64/mode64/sqr_basecase.asm b/mpn/powerpc64/mode64/sqr_basecase.asm
new file mode 100644
index 000000000..72ac2d318
--- /dev/null
+++ b/mpn/powerpc64/mode64/sqr_basecase.asm
@@ -0,0 +1,852 @@
+dnl PowerPC-64 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 Free
+dnl Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 16.25
+C POWER7 3.77
+
+C NOTES
+C * This is very crude, cleanup!
+C * Try to reduce the number of needed live registers.
+C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The
+C cost will be more live registers.
+C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
+C size a lot and speed things up perhaps 25%.
+C * Use computed goto in order to compress the code.
+C * Implement a larger final corner.
+C * Schedule callee-saves register saves into other insns. This could save
+C about 5 cycles/call. (We cannot analogously optimise the restores, since
+C the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
+C * Should the alternating std/adde sequences be split? Some pipelines handle
+C adde poorly, and might sequentialise all these instructions.
+C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
+C adjacent integer multiply insns. Except for the multiply insns, the code
+C was not carefully optimised for POWER6 or any other CPU.
+C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+define(`rp_outer', `r25')
+define(`up_outer', `r21')
+define(`rp_saved', `r22')
+define(`up_saved', `r23')
+define(`n_saved', `r24')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+ cmpdi cr0, n, 2 C size dispatch: n < 2 falls through, n = 2 and n > 2 branch away
+ bge cr0, L(ge2)
+ ld r5, 0(up) C n = 1
+ nop
+ mulld r8, r5, r5 C weight 0
+ mulhdu r9, r5, r5 C weight 1
+ std r8, 0(rp)
+ std r9, 8(rp)
+ blr
+ ALIGN(16)
+L(ge2): bgt cr0, L(gt2)
+ ld r0, 0(up) C n = 2
+ nop
+ mulld r8, r0, r0 C u0 * u0
+ mulhdu r9, r0, r0 C u0 * u0
+ ld r6, 8(up)
+ mulld r10, r6, r6 C u1 * u1
+ mulhdu r11, r6, r6 C u1 * u1
+ mulld r4, r6, r0 C u1 * u0
+ mulhdu r5, r6, r0 C u1 * u0
+ addc r4, r4, r4 C double the cross product u1 * u0 ...
+ adde r5, r5, r5
+ addze r11, r11 C ... and carry its shifted-out bit into the top limb
+ addc r9, r9, r4 C add the doubled cross product starting at limb 1
+ adde r10, r10, r5
+ addze r11, r11
+ std r8, 0(rp)
+ std r9, 8(rp)
+ std r10, 16(rp)
+ std r11, 24(rp)
+ blr
+
+ ALIGN(16)
+L(gt2): std r31, -8(r1) C n > 2: save r21-r31 at negative offsets from r1 (no frame is allocated here)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ std r25, -56(r1)
+ std r24, -64(r1)
+ std r23, -72(r1)
+ std r22, -80(r1)
+ std r21, -88(r1)
+
+ mr rp_saved, rp C keep the original arguments; they are reloaded for the final diagonal pass
+ mr up_saved, up
+ mr n_saved, n
+ mr rp_outer, rp C per-pass base pointers, advanced at each outer-loop entry
+ mr up_outer, up
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addic r7, n, 2 C compute count... (addic also writes CA; presumably used so the adde chains start with CA clear -- confirm)
+ srdi r7, r7, 2 C ...for ctr
+ mtctr r7 C copy count into ctr
+ beq- cr0, L(b0) C n & 3 = 0
+ blt- cr6, L(b1) C n & 3 = 1
+ beq- cr6, L(b2) C n & 3 = 2
+
+L(b3): ld r6, 0(up) C n & 3 = 3; r6 = u[0], the multiplier for the whole first pass
+ ld r9, 8(up)
+ ld r27, 16(up)
+ addi up, up, 24
+ li r12, 0 C carry limb
+ bdz L(em3) C ctr hit zero: no 4-limb iteration needed, go to the tail
+
+ ALIGN(16)
+L(tm3): mulld r0, r9, r6 C first-pass loop: four products by r6 per iteration, stored without reading rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm3)
+
+L(em3): mulld r0, r9, r6 C tail: last two products of the first pass
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8 C fold the final carry into the top limb
+ std r8, 24(rp)
+ addi n, n, 2 C n += 2; each outer pass then decrements n and uses n >> 2 as ctr
+ b L(outer_loop)
+
+L(b0): ld r6, 0(up) C n & 3 = 0; r6 = u[0], the multiplier for the whole first pass
+ ld r27, 8(up)
+ mulld r7, r27, r6 C feed-in: one product before the 4-way loop
+ mulhdu r12, r27, r6 C its high half becomes the carry limb
+ std r7, 8(rp)
+ addi rp, rp, 8
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(em0) C ctr hit zero: go to the tail
+
+ ALIGN(16)
+L(tm0): mulld r0, r9, r6 C first-pass loop: four products by r6 per iteration, stored without reading rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm0)
+
+L(em0): mulld r0, r9, r6 C tail: last two products of the first pass
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8 C fold the final carry into the top limb
+ std r8, 24(rp)
+ addi n, n, 2 C n += 2; each outer pass then decrements n and uses n >> 2 as ctr
+ b L(outer_loop_ent_2)
+
+L(b1): ld r6, 0(up) C n & 3 = 1; r6 = u[0], the multiplier for the whole first pass
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6 C feed-in: two products before the 4-way loop
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r12, r27, r6 C high half of the second product becomes the carry limb
+ addc r7, r7, r26 C CA from here is consumed by the next adde (loop or tail)
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addi rp, rp, 16
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(em1) C ctr hit zero: go to the tail
+
+ ALIGN(16)
+L(tm1): mulld r0, r9, r6 C first-pass loop: four products by r6 per iteration, stored without reading rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm1)
+
+L(em1): mulld r0, r9, r6 C tail: last two products of the first pass
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8 C fold the final carry into the top limb
+ std r8, 24(rp)
+ addi n, n, 2 C n += 2; each outer pass then decrements n and uses n >> 2 as ctr
+ b L(outer_loop_ent_3)
+
+L(b2): addi r7, r7, -1 C FIXME: n & 3 = 2 path reloads ctr with one less than (n + 2) >> 2
+ mtctr r7 C FIXME
+ ld r6, 0(up) C r6 = u[0], the multiplier for the whole first pass
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6 C feed-in: three products before the 4-way loop
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r26
+ adde r11, r11, r8
+ addze r12, r10 C carry limb = high half of third product plus carry
+ std r0, 8(rp)
+ std r7, 16(rp)
+ std r11, 24(rp)
+ addi rp, rp, 24
+ ld r9, 32(up)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(em2) C ctr hit zero: go to the tail
+
+ ALIGN(16)
+L(tm2): mulld r0, r9, r6 C first-pass loop: four products by r6 per iteration, stored without reading rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm2)
+
+L(em2): mulld r0, r9, r6 C tail: last two products of the first pass
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8 C fold the final carry into the top limb
+ std r8, 24(rp)
+ addi n, n, 2 C n += 2; each outer pass then decrements n and uses n >> 2 as ctr
+ b L(outer_loop_ent_0)
+
+
+L(outer_loop):
+ addi n, n, -1 C one fewer limb to process in this pass
+ addi up_outer, up_outer, 8 C next multiplier limb
+ addi rp_outer, rp_outer, 16 C result window moves up two limbs per pass
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ bdz L(outer_end) C ctr decremented to zero: only the final single product remains
+
+ ld r6, 0(up) C r6 = multiplier limb for this pass
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6 C feed-in: three products accumulated into rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ ld r30, 16(rp)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r26
+ adde r11, r11, r8
+ addze r12, r10 C carry limb for the loop
+ addc r0, r0, r28 C second chain: add the limbs already in rp
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ adde r11, r11, r30
+ std r11, 16(rp)
+ addi rp, rp, 24
+ ld r9, 32(up)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(ea1) C ctr hit zero: go to the tail
+
+ ALIGN(16)
+L(ta1): mulld r0, r9, r6 C accumulating loop: rp[] += u[] * r6, four limbs per iteration
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta1)
+
+L(ea1): mulld r0, r9, r6 C tail: last two products of this pass, accumulated into rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp) C carry-out limb is stored, not added: nothing is loaded from this slot first
+
+L(outer_loop_ent_0):
+ addi n, n, -1 C one fewer limb to process in this pass
+ addi up_outer, up_outer, 8 C next multiplier limb
+ addi rp_outer, rp_outer, 16 C result window moves up two limbs per pass
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up) C r6 = multiplier limb for this pass
+ ld r9, 8(up)
+ ld r27, 16(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ mulld r0, r9, r6 C feed-in: two products accumulated into rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ addc r0, r0, r28
+ adde r7, r7, r26
+ addze r12, r8 C carry limb for the loop
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addi rp, rp, 16
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(ea0) C ctr hit zero: go to the tail
+
+ ALIGN(16)
+L(ta0): mulld r0, r9, r6 C accumulating loop: rp[] += u[] * r6, four limbs per iteration
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta0)
+
+L(ea0): mulld r0, r9, r6 C tail: last two products of this pass, accumulated into rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp) C carry-out limb is stored, not added: nothing is loaded from this slot first
+
+L(outer_loop_ent_3):
+ addi n, n, -1 C one fewer limb to process in this pass
+ addi up_outer, up_outer, 8 C next multiplier limb
+ addi rp_outer, rp_outer, 16 C result window moves up two limbs per pass
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up) C r6 = multiplier limb for this pass
+ ld r9, 8(up)
+ ld r28, 0(rp)
+ mulld r0, r9, r6 C feed-in: one product accumulated into rp
+ mulhdu r12, r9, r6 C its high half becomes the carry limb
+ addc r0, r0, r28
+ std r0, 0(rp)
+ addi rp, rp, 8
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(ea3) C ctr hit zero: go to the tail
+
+ ALIGN(16)
+L(ta3): mulld r0, r9, r6 C accumulating loop: rp[] += u[] * r6, four limbs per iteration
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta3)
+
+L(ea3): mulld r0, r9, r6 C tail: last two products of this pass, accumulated into rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp) C carry-out limb is stored, not added: nothing is loaded from this slot first
+
+
+L(outer_loop_ent_2):
+ addi n, n, -1 C one fewer limb to process in this pass
+ addi up_outer, up_outer, 8 C next multiplier limb
+ addi rp_outer, rp_outer, 16 C result window moves up two limbs per pass
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ addic r0, r0, 0 C adding 0 leaves r0 unchanged and clears CA for the adde chain below
+ li r12, 0 C cy_limb = 0
+ ld r6, 0(up) C r6 = multiplier limb for this pass
+ ld r9, 8(up)
+ ld r27, 16(up)
+ bdz L(ea2) C ctr hit zero: go to the tail (which does not use up)
+ addi up, up, 24
+
+ ALIGN(16)
+L(ta2): mulld r0, r9, r6 C accumulating loop: rp[] += u[] * r6, four limbs per iteration
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta2)
+
+L(ea2): mulld r0, r9, r6 C tail: last two products of this pass, accumulated into rp
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp) C carry-out limb is stored, not added: nothing is loaded from this slot first
+
+ b L(outer_loop)
+
+L(outer_end):
+ ld r6, 0(up) C last pass: a single product of two adjacent limbs
+ ld r9, 8(up)
+ ld r11, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r8, r9, r6
+ addc r0, r0, r11 C low half is added into the limb already in rp
+ std r0, 0(rp)
+ addze r8, r8
+ std r8, 8(rp) C high half plus carry is stored as a new limb
+
+define(`rp', `rp_saved')
+define(`up', `r5')
+define(`n', `r6')
+define(`climb', `r0')
+
+ addi r4, rp_saved, 8 C NOTE(review): r4 is not read after this point; looks like a leftover, confirm before removing
+ mr r5, up_saved C up for the diagonal pass (rp is now an alias for rp_saved)
+ mr r6, n_saved C n for the diagonal pass
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 2 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C put loop count into ctr
+ beq cr0, L(xb0) C n & 3 = 0
+ blt cr6, L(xb1) C n & 3 = 1
+ beq cr6, L(xb2) C n & 3 = 2
+
+L(xb3): ld r6, 0(up) C n & 3 = 3 feed-in: squares of the first three limbs
+ ld r7, 8(up)
+ ld r12, 16(up)
+ addi up, up, 24
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ ld r10, 8(rp) C load the cross-product limbs stored by the earlier passes
+ ld r11, 16(rp)
+ ld r6, 24(rp)
+ ld r7, 32(rp)
+ addc r10, r10, r10 C double them ...
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ addze climb, r29 C climb = high half of the last square plus the shifted-out bit
+ addc r10, r10, r25 C ... and add the squares, one limb up from where each starts
+ adde r11, r11, r26
+ adde r6, r6, r27
+ adde r7, r7, r28
+ std r24, 0(rp) C lowest limb is the low half of the first square alone
+ std r10, 8(rp)
+ std r11, 16(rp)
+ std r6, 24(rp)
+ std r7, 32(rp)
+ addi rp, rp, 40
+ bdnz L(top)
+ b L(end)
+
+L(xb2): ld r6, 0(up) C n & 3 = 2 feed-in: squares of the first two limbs
+ ld r7, 8(up)
+ addi up, up, 16
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ addc r10, r10, r10 C double the stored cross-product limbs
+ adde r11, r11, r11
+ addze climb, r27 C climb = high half of the last square plus the shifted-out bit
+ addc r10, r10, r25 C add the squares
+ adde r11, r11, r26
+ std r24, 0(rp)
+ std r10, 8(rp)
+ std r11, 16(rp)
+ addi rp, rp, 24
+ bdnz L(top)
+ b L(end)
+
+L(xb0): ld r6, 0(up) C n & 3 = 0 feed-in: squares of the first four limbs
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ ld r6, 24(rp)
+ ld r7, 32(rp)
+ ld r12, 40(rp)
+ ld r23, 48(rp)
+ addc r10, r10, r10 C double the stored cross-product limbs
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze climb, r31 C climb = high half of the last square plus the shifted-out bit
+ std r24, 0(rp)
+ addc r10, r10, r25 C add the squares
+ std r10, 8(rp)
+ adde r11, r11, r26
+ std r11, 16(rp)
+ adde r6, r6, r27
+ std r6, 24(rp)
+ adde r7, r7, r28
+ std r7, 32(rp)
+ adde r12, r12, r29
+ std r12, 40(rp)
+ adde r23, r23, r30
+ std r23, 48(rp)
+ addi rp, rp, 56
+ bdnz L(top)
+ b L(end)
+
+L(xb1): ld r6, 0(up) C n & 3 = 1 feed-in: square of the first limb only
+ addi up, up, 8
+ mulld r24, r6, r6
+ mulhdu climb, r6, r6
+ std r24, 0(rp)
+ addic rp, rp, 8 C clear carry as side-effect
+
+ ALIGN(32)
+L(top): ld r6, 0(up) C main diagonal loop: four squares and eight result limbs per iteration
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r8, 0(rp)
+ ld r9, 8(rp)
+ adde r8, r8, r8 C limb * 2 + incoming CA; CA out carries the shifted-out bit onward
+ adde r9, r9, r9
+ ld r10, 16(rp)
+ ld r11, 24(rp)
+ adde r10, r10, r10
+ adde r11, r11, r11
+ ld r6, 32(rp)
+ ld r7, 40(rp)
+ adde r6, r6, r6
+ adde r7, r7, r7
+ ld r12, 48(rp)
+ ld r23, 56(rp)
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze r31, r31 C fold the last shifted-out bit into the top square limb
+ addc r8, r8, climb C add the carry limb left by the previous group
+ std r8, 0(rp)
+ adde r9, r9, r24
+ std r9, 8(rp)
+ adde r10, r10, r25
+ std r10, 16(rp)
+ adde r11, r11, r26
+ std r11, 24(rp)
+ adde r6, r6, r27
+ std r6, 32(rp)
+ adde r7, r7, r28
+ std r7, 40(rp)
+ adde r12, r12, r29
+ std r12, 48(rp)
+ adde r23, r23, r30
+ std r23, 56(rp)
+ mr climb, r31 C carry limb for the next group (CA from the last adde stays pending)
+ addi rp, rp, 64
+ bdnz L(top)
+
+L(end): addze climb, climb C absorb the pending CA
+ std climb, 0(rp) C store the final, highest limb
+
+ ld r31, -8(r1) C restore r21-r31 from the slots written at L(gt2)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ ld r26, -48(r1)
+ ld r25, -56(r1)
+ ld r24, -64(r1)
+ ld r23, -72(r1)
+ ld r22, -80(r1)
+ ld r21, -88(r1)
+ blr
+EPILOGUE()
diff --git a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
deleted file mode 100644
index 663f04c14..000000000
--- a/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,238 +0,0 @@
-dnl PowerPC-64 mpn_sqr_diag_addlsh1
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C POWER3/PPC630 10
-C POWER4/PPC970 6
-C POWER5 5.375
-C POWER6 8.5
-
-C NOTES
-C * This was written for POWER6 and its preferences for adjacent integer
-C multiply insns. The cost is that we get a large set of live registers,
-C and therefore need to save 9 callee-saves registers. Except for the
-C multiply insns, the code was not carefully optimised for POWER6 or any
-C other CPU.
-C * Perform some cross-jumping in the feed-in code, into the loop's tail.
-
-C refmpn_sqr_diag_addlsh1 (mp_ptr rp, mp_srcptr tp, mp_srcptr up, mp_size_t n)
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`tp', `r4')
-define(`up', `r5')
-define(`n', `r6')
-
-define(`climb', `r0')
-
-ASM_START()
-PROLOGUE(mpn_sqr_diag_addlsh1)
- std r31, -8(r1)
- std r30, -16(r1)
- std r29, -24(r1)
- std r28, -32(r1)
- std r27, -40(r1)
- std r26, -48(r1)
- std r25, -56(r1)
- std r24, -64(r1)
- std r23, -72(r1)
-
- rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
- cmpdi cr6, r0, 2
- addi n, n, 2 C compute count...
- srdi n, n, 2 C ...for ctr
- mtctr n C put loop count into ctr
- beq cr0, L(b0)
- blt cr6, L(b1)
- beq cr6, L(b2)
-
-L(b3): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- addi up, up, 24
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- ld r10, 0(tp)
- ld r11, 8(tp)
- ld r6, 16(tp)
- ld r7, 24(tp)
- addi tp, tp, 32
- addc r10, r10, r10
- adde r11, r11, r11
- adde r6, r6, r6
- adde r7, r7, r7
- addze climb, r29
- addc r10, r10, r25
- adde r11, r11, r26
- adde r6, r6, r27
- adde r7, r7, r28
- std r24, 0(rp)
- std r10, 8(rp)
- std r11, 16(rp)
- std r6, 24(rp)
- std r7, 32(rp)
- addi rp, rp, 40
- bdnz L(top)
- b L(end)
-
-L(b2): ld r6, 0(up)
- ld r7, 8(up)
- addi up, up, 16
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- ld r10, 0(tp)
- ld r11, 8(tp)
- addi tp, tp, 16
- addc r10, r10, r10
- adde r11, r11, r11
- addze climb, r27
- addc r10, r10, r25
- adde r11, r11, r26
- std r24, 0(rp)
- std r10, 8(rp)
- std r11, 16(rp)
- addi rp, rp, 24
- bdnz L(top)
- b L(end)
-
-L(b0): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- ld r23, 24(up)
- addi up, up, 32
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- mulld r30, r23, r23
- mulhdu r31, r23, r23
- ld r10, 0(tp)
- ld r11, 8(tp)
- ld r6, 16(tp)
- ld r7, 24(tp)
- ld r12, 32(tp)
- ld r23, 40(tp)
- addi tp, tp, 48
- addc r10, r10, r10
- adde r11, r11, r11
- adde r6, r6, r6
- adde r7, r7, r7
- adde r12, r12, r12
- adde r23, r23, r23
- addze climb, r31
- std r24, 0(rp)
- addc r10, r10, r25
- std r10, 8(rp)
- adde r11, r11, r26
- std r11, 16(rp)
- adde r6, r6, r27
- std r6, 24(rp)
- adde r7, r7, r28
- std r7, 32(rp)
- adde r12, r12, r29
- std r12, 40(rp)
- adde r23, r23, r30
- std r23, 48(rp)
- addi rp, rp, 56
- bdnz L(top)
- b L(end)
-
-L(b1): ld r6, 0(up)
- addi up, up, 8
- mulld r24, r6, r6
- mulhdu climb, r6, r6
- std r24, 0(rp)
- addic rp, rp, 8 C clear carry as side-effect
-
- ALIGN(32)
-L(top): ld r6, 0(up)
- ld r7, 8(up)
- ld r12, 16(up)
- ld r23, 24(up)
- addi up, up, 32
- mulld r24, r6, r6
- mulhdu r25, r6, r6
- mulld r26, r7, r7
- mulhdu r27, r7, r7
- mulld r28, r12, r12
- mulhdu r29, r12, r12
- mulld r30, r23, r23
- mulhdu r31, r23, r23
- ld r8, 0(tp)
- ld r9, 8(tp)
- adde r8, r8, r8
- adde r9, r9, r9
- ld r10, 16(tp)
- ld r11, 24(tp)
- adde r10, r10, r10
- adde r11, r11, r11
- ld r6, 32(tp)
- ld r7, 40(tp)
- adde r6, r6, r6
- adde r7, r7, r7
- ld r12, 48(tp)
- ld r23, 56(tp)
- adde r12, r12, r12
- adde r23, r23, r23
- addi tp, tp, 64
- addze r31, r31
- addc r8, r8, climb
- std r8, 0(rp)
- adde r9, r9, r24
- std r9, 8(rp)
- adde r10, r10, r25
- std r10, 16(rp)
- adde r11, r11, r26
- std r11, 24(rp)
- adde r6, r6, r27
- std r6, 32(rp)
- adde r7, r7, r28
- std r7, 40(rp)
- adde r12, r12, r29
- std r12, 48(rp)
- adde r23, r23, r30
- std r23, 56(rp)
- mr climb, r31
- addi rp, rp, 64
- bdnz L(top)
-
-L(end): addze climb, climb
- std climb, 0(rp)
-
-L(ret): ld r31, -8(r1)
- ld r30, -16(r1)
- ld r29, -24(r1)
- ld r28, -32(r1)
- ld r27, -40(r1)
- ld r26, -48(r1)
- ld r25, -56(r1)
- ld r24, -64(r1)
- ld r23, -72(r1)
- blr
-EPILOGUE()
diff --git a/mpn/powerpc64/rshift.asm b/mpn/powerpc64/rshift.asm
index 6545af769..18406c57e 100644
--- a/mpn/powerpc64/rshift.asm
+++ b/mpn/powerpc64/rshift.asm
@@ -19,11 +19,12 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630 ?
-C POWER4/PPC970 ?
-C POWER5 2.25
-C POWER6 9.75
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 2.25
+C POWER6 9.75
+C POWER7 2.15
C TODO
C * Try to reduce the number of needed live registers
diff --git a/mpn/powerpc64/tabselect.asm b/mpn/powerpc64/tabselect.asm
new file mode 100644
index 000000000..7d189388b
--- /dev/null
+++ b/mpn/powerpc64/tabselect.asm
@@ -0,0 +1,96 @@
+dnl PowerPC-64 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 3.3
+C POWER5 ?
+C POWER6 ?
+C POWER7 2.5
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using VMX could result in significant speedup for certain CPUs.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `r3')
+define(`tp', `r4')
+define(`n', `r5')
+define(`nents', `r6')
+define(`which', `r7')
+
+define(`mask', `r8')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_tabselect)
+ addi r0, n, 1 C r0 = (n + 1) / 2: the inner loop handles two limbs per iteration
+ srdi r0, r0, 1 C inner loop count
+ andi. r9, n, 1 C set cr0 for use in inner loop
+ subf which, nents, which C which -= nents, so which + nents reaches zero at the selected entry
+ sldi n, n, 3 C n is now the entry size in bytes, used to rewind rp
+
+L(outer):
+ mtctr r0 C put inner loop count in ctr
+
+ add r9, which, nents C are we at the selected table entry?
+ addic r9, r9, -1 C set CF iff not selected entry
+ subfe mask, r0, r0 C mask = all ones if selected (CF clear), zero otherwise
+
+ beq cr0, L(top) C branch to loop entry if n even
+
+ ld r9, 0(tp) C n odd: handle a single limb before the 2-way loop
+ addi tp, tp, 8
+ and r9, r9, mask C keep the table limb only when this entry is selected
+ ld r11, 0(rp)
+ andc r11, r11, mask C keep the existing rp limb otherwise
+ or r9, r9, r11
+ std r9, 0(rp)
+ addi rp, rp, 8
+ bdz L(end) C that was the only limb (ctr reached zero)
+
+ ALIGN(16)
+L(top): ld r9, 0(tp) C two limbs per iteration: rp[i] = (tp[i] & mask) | (rp[i] & ~mask)
+ ld r10, 8(tp)
+ addi tp, tp, 16
+ nop
+ and r9, r9, mask
+ and r10, r10, mask
+ ld r11, 0(rp)
+ ld r12, 8(rp)
+ andc r11, r11, mask
+ andc r12, r12, mask
+ or r9, r9, r11
+ or r10, r10, r12
+ std r9, 0(rp)
+ std r10, 8(rp)
+ addi rp, rp, 16
+ bdnz L(top)
+
+L(end): subf rp, n, rp C move rp back to beginning
+ cmpdi cr6, nents, 1 C was this the last table entry?
+ addi nents, nents, -1
+ bne cr6, L(outer) C no: process the next entry (tp already points at it)
+
+ blr
+EPILOGUE()
diff --git a/mpn/s390_32/esame/gmp-mparam.h b/mpn/s390_32/esame/gmp-mparam.h
index 5dedeeb81..a6508be1a 100644
--- a/mpn/s390_32/esame/gmp-mparam.h
+++ b/mpn/s390_32/esame/gmp-mparam.h
@@ -24,43 +24,45 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* 1200 MHz IBM z990 running in 32-bit mode */
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
-#define DIVREM_1_UNNORM_THRESHOLD 3
-#define MOD_1_1P_METHOD 1
+#define DIVREM_1_UNNORM_THRESHOLD 4
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 3
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 12
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 21
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 17
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 42
#define USE_PREINV_DIVREM_1 1
#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 50
+#define BMOD_1_TO_MOD_1_THRESHOLD 30
#define MUL_TOOM22_THRESHOLD 16
-#define MUL_TOOM33_THRESHOLD 66
-#define MUL_TOOM44_THRESHOLD 169
-#define MUL_TOOM6H_THRESHOLD 369
-#define MUL_TOOM8H_THRESHOLD 517
+#define MUL_TOOM33_THRESHOLD 57
+#define MUL_TOOM44_THRESHOLD 147
+#define MUL_TOOM6H_THRESHOLD 226
+#define MUL_TOOM8H_THRESHOLD 333
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 106
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 187
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 100
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 28
-#define SQR_TOOM3_THRESHOLD 93
-#define SQR_TOOM4_THRESHOLD 387
-#define SQR_TOOM6_THRESHOLD 552
-#define SQR_TOOM8_THRESHOLD 0 /* always */
+#define SQR_TOOM2_THRESHOLD 26
+#define SQR_TOOM3_THRESHOLD 81
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 478
#define MULMID_TOOM42_THRESHOLD 38
#define MULMOD_BNM1_THRESHOLD 13
#define SQRMOD_BNM1_THRESHOLD 15
+#define POWM_SEC_TABLE 4,23,262,892,2500
+
#define MUL_FFT_MODF_THRESHOLD 336 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 336, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \
@@ -91,37 +93,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 35
#define SQR_FFT_THRESHOLD 2368
-#define MULLO_BASECASE_THRESHOLD 6
-#define MULLO_DC_THRESHOLD 45
+#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_DC_THRESHOLD 49
#define MULLO_MUL_N_THRESHOLD 5397
-#define DC_DIV_QR_THRESHOLD 40
-#define DC_DIVAPPR_Q_THRESHOLD 152
+#define DC_DIV_QR_THRESHOLD 42
+#define DC_DIVAPPR_Q_THRESHOLD 146
#define DC_BDIV_QR_THRESHOLD 51
-#define DC_BDIV_Q_THRESHOLD 136
+#define DC_BDIV_Q_THRESHOLD 124
#define INV_MULMOD_BNM1_THRESHOLD 46
-#define INV_NEWTON_THRESHOLD 197
-#define INV_APPR_THRESHOLD 157
+#define INV_NEWTON_THRESHOLD 179
+#define INV_APPR_THRESHOLD 153
-#define BINV_NEWTON_THRESHOLD 114
+#define BINV_NEWTON_THRESHOLD 214
#define REDC_1_TO_REDC_N_THRESHOLD 55
-#define MU_DIV_QR_THRESHOLD 1210
-#define MU_DIVAPPR_Q_THRESHOLD 1334
-#define MUPI_DIV_QR_THRESHOLD 81
-#define MU_BDIV_QR_THRESHOLD 942
-#define MU_BDIV_Q_THRESHOLD 1258
-
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 104
-#define GCD_DC_THRESHOLD 278
+#define MU_DIV_QR_THRESHOLD 1078
+#define MU_DIVAPPR_Q_THRESHOLD 1078
+#define MUPI_DIV_QR_THRESHOLD 74
+#define MU_BDIV_QR_THRESHOLD 872
+#define MU_BDIV_Q_THRESHOLD 1078
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD_THRESHOLD 90
+#define HGCD_APPR_THRESHOLD 111
+#define HGCD_REDUCE_THRESHOLD 1962
+#define GCD_DC_THRESHOLD 225
#define GCDEXT_DC_THRESHOLD 217
#define JACOBI_BASE_METHOD 2
-#define GET_STR_DC_THRESHOLD 16
-#define GET_STR_PRECOMPUTE_THRESHOLD 30
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
#define SET_STR_DC_THRESHOLD 274
#define SET_STR_PRECOMPUTE_THRESHOLD 824
-
-/* Tuneup completed successfully, took 108 seconds */
diff --git a/mpn/s390_32/lshift.asm b/mpn/s390_32/lshift.asm
index 335a5f77a..17e52655f 100644
--- a/mpn/s390_32/lshift.asm
+++ b/mpn/s390_32/lshift.asm
@@ -126,7 +126,7 @@ L(top): l %r10, 16(up)
L(end): l %r10, 16(up)
sll %r10, 0(cnt)
st %r10, 12(rp)
-
+
lr %r2, %r12
lm %r6, %r12, 24(%r15)
br %r14
diff --git a/mpn/s390_32/lshiftc.asm b/mpn/s390_32/lshiftc.asm
index b42bc715b..9bdd0d798 100644
--- a/mpn/s390_32/lshiftc.asm
+++ b/mpn/s390_32/lshiftc.asm
@@ -138,7 +138,7 @@ L(end): l %r10, 16(up)
sll %r10, 0(cnt)
xr %r10, %r13
st %r10, 12(rp)
-
+
lr %r2, %r12
lm %r6, %r13, 24(%r15)
br %r14
diff --git a/mpn/s390_32/rshift.asm b/mpn/s390_32/rshift.asm
index ec32fa764..becbe1893 100644
--- a/mpn/s390_32/rshift.asm
+++ b/mpn/s390_32/rshift.asm
@@ -120,7 +120,7 @@ L(top): l %r11, 0(up)
L(end): l %r11, 0(up)
srl %r11, 0(cnt)
st %r11, 0(rp)
-
+
lr %r2, %r12
lm %r6, %r12, 24(%r15)
br %r14
diff --git a/mpn/s390_64/README b/mpn/s390_64/README
new file mode 100644
index 000000000..82b68a080
--- /dev/null
+++ b/mpn/s390_64/README
@@ -0,0 +1,77 @@
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+
+
+There are 5 generations of 64-bit s390 processors, z900, z990, z9,
+z10, and z196. The current GMP code was optimised for the two oldest,
+z900 and z990.
+
+
+mpn_copyi
+
+This code makes use of a loop around MVC. It almost surely runs very
+close to optimally. A small improvement could be made by using one
+MVC for size 256 bytes; currently we use two (we use an extra MVC when
+copying any multiple of 256 bytes).
+
+
+mpn_copyd
+
+We have tried several feed-in variants here, branch tree, jump table
+and computed goto. The fastest (on z990) turned out to be computed
+goto.
+
+An approach not tried is EX of LMG and STMG, modifying the register set
+on-the-fly. Using that trick, we could completely avoid using
+separate feed-in paths.
+
+
+mpn_lshift, mpn_rshift
+
+The current code runs at pipeline decode bandwidth on z990.
+
+
+mpn_add_n, mpn_sub_n
+
+The current code is 4-way unrolled. It should be unrolled more, at
+least 8x, in order to reach 2.5 c/l.
+
+
+mpn_mul_1, mpn_addmul_1, mpn_submul_1
+
+The current code is very naive, but due to the non-pipelined nature of
+MLGR on z900 and z990, more sophisticated code would not gain much.
+
+On z10 one would need to cluster at least 4 MLGR together, in order to
+reduce stalling.
+
+On z196, one surely wants to use unrolling and pipelining, to perhaps
+reach around 12 c/l. A major issue here and on z10 is ALCGR's 3 cycle
+stalling.
+
+
+mpn_mul_2, mpn_addmul_2
+
+At least for older machines (z900, z990) with very slow MLGR, we
+should use Karatsuba's algorithm on 2-limb units, making mul_2 and
+addmul_2 the main multiplication primitives. The newer machines might
+benefit less from this approach, perhaps in particular z10, where MLGR
+clustering is more important.
+
+With Karatsuba, one could hope for around 16 cycles per accumulated
+128-bit cross product, on z990.
diff --git a/mpn/s390_64/gmp-mparam.h b/mpn/s390_64/gmp-mparam.h
index c4960254e..c0ade71c2 100644
--- a/mpn/s390_64/gmp-mparam.h
+++ b/mpn/s390_64/gmp-mparam.h
@@ -28,19 +28,19 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 58
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 38
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 19
#define USE_PREINV_DIVREM_1 1
#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
-#define BMOD_1_TO_MOD_1_THRESHOLD 47
+#define BMOD_1_TO_MOD_1_THRESHOLD 88
#define MUL_TOOM22_THRESHOLD 10
#define MUL_TOOM33_THRESHOLD 41
-#define MUL_TOOM44_THRESHOLD 99
+#define MUL_TOOM44_THRESHOLD 104
#define MUL_TOOM6H_THRESHOLD 149
#define MUL_TOOM8H_THRESHOLD 212
@@ -61,6 +61,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULMOD_BNM1_THRESHOLD 9
#define SQRMOD_BNM1_THRESHOLD 11
+#define POWM_SEC_TABLE 4,23,128,598
+
#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 220, 5}, { 7, 4}, { 15, 5}, { 8, 4}, \
@@ -131,7 +133,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULLO_BASECASE_THRESHOLD 3
#define MULLO_DC_THRESHOLD 33
-#define MULLO_MUL_N_THRESHOLD 4392
+#define MULLO_MUL_N_THRESHOLD 5240
#define DC_DIV_QR_THRESHOLD 28
#define DC_DIVAPPR_Q_THRESHOLD 106
@@ -152,12 +154,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 680
#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 71
-#define GCD_DC_THRESHOLD 177
-#define GCDEXT_DC_THRESHOLD 142
-#define JACOBI_BASE_METHOD 2
+#define HGCD_THRESHOLD 75
+#define HGCD_APPR_THRESHOLD 59
+#define HGCD_REDUCE_THRESHOLD 901
+#define GCD_DC_THRESHOLD 186
+#define GCDEXT_DC_THRESHOLD 150
+#define JACOBI_BASE_METHOD 3
#define GET_STR_DC_THRESHOLD 27
#define GET_STR_PRECOMPUTE_THRESHOLD 40
-#define SET_STR_DC_THRESHOLD 363
+#define SET_STR_DC_THRESHOLD 418
#define SET_STR_PRECOMPUTE_THRESHOLD 1111
diff --git a/mpn/sparc64/ultrasparc34/gmp-mparam.h b/mpn/sparc64/ultrasparc34/gmp-mparam.h
index faed8efa3..8fe8ddc54 100644
--- a/mpn/sparc64/ultrasparc34/gmp-mparam.h
+++ b/mpn/sparc64/ultrasparc34/gmp-mparam.h
@@ -28,12 +28,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 38
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 33
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
@@ -55,8 +56,12 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 191
#define SQR_TOOM8_THRESHOLD 339
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 13
+#define MULMID_TOOM42_THRESHOLD 42
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 9
+
+#define POWM_SEC_TABLE 4,23,130,780,1812,1926
#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -157,7 +162,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 182
#define SQR_FFT_THRESHOLD 1984
-#define MULLO_BASECASE_THRESHOLD 8
+#define MULLO_BASECASE_THRESHOLD 14
#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */
#define MULLO_MUL_N_THRESHOLD 3791
@@ -170,7 +175,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define INV_NEWTON_THRESHOLD 17
#define INV_APPR_THRESHOLD 17
-#define BINV_NEWTON_THRESHOLD 134
+#define BINV_NEWTON_THRESHOLD 92
#define REDC_1_TO_REDC_2_THRESHOLD 2
#define REDC_2_TO_REDC_N_THRESHOLD 117
@@ -181,12 +186,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 748
#define MATRIX22_STRASSEN_THRESHOLD 12
-#define HGCD_THRESHOLD 46
-#define GCD_DC_THRESHOLD 130
+#define HGCD_THRESHOLD 45
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 1094
+#define GCD_DC_THRESHOLD 126
#define GCDEXT_DC_THRESHOLD 134
#define JACOBI_BASE_METHOD 2
#define GET_STR_DC_THRESHOLD 18
#define GET_STR_PRECOMPUTE_THRESHOLD 27
-#define SET_STR_DC_THRESHOLD 315
+#define SET_STR_DC_THRESHOLD 286
#define SET_STR_PRECOMPUTE_THRESHOLD 1037
diff --git a/mpn/sparc64/ultrasparct1/gmp-mparam.h b/mpn/sparc64/ultrasparct1/gmp-mparam.h
index 744f7e17c..34c8027f5 100644
--- a/mpn/sparc64/ultrasparct1/gmp-mparam.h
+++ b/mpn/sparc64/ultrasparct1/gmp-mparam.h
@@ -25,14 +25,16 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DIVREM_1_NORM_THRESHOLD 0 /* always */
#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1P_METHOD 2
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 35
#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always */
#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
@@ -50,13 +52,17 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_BASECASE_THRESHOLD 0 /* always */
#define SQR_TOOM2_THRESHOLD 16
#define SQR_TOOM3_THRESHOLD 57
-#define SQR_TOOM4_THRESHOLD 133
-#define SQR_TOOM6_THRESHOLD 156
+#define SQR_TOOM4_THRESHOLD 135
+#define SQR_TOOM6_THRESHOLD 160
#define SQR_TOOM8_THRESHOLD 260
+#define MULMID_TOOM42_THRESHOLD 12
+
#define MULMOD_BNM1_THRESHOLD 7
#define SQRMOD_BNM1_THRESHOLD 7
+#define POWM_SEC_TABLE 2,23,176,625,2783
+
#define MUL_FFT_MODF_THRESHOLD 176 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 176, 5}, { 7, 6}, { 4, 5}, { 9, 6}, \
@@ -102,30 +108,32 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULLO_MUL_N_THRESHOLD 3176
#define DC_DIV_QR_THRESHOLD 27
-#define DC_DIVAPPR_Q_THRESHOLD 107
+#define DC_DIVAPPR_Q_THRESHOLD 108
#define DC_BDIV_QR_THRESHOLD 27
#define DC_BDIV_Q_THRESHOLD 62
-#define INV_MULMOD_BNM1_THRESHOLD 22
+#define INV_MULMOD_BNM1_THRESHOLD 14
#define INV_NEWTON_THRESHOLD 163
#define INV_APPR_THRESHOLD 117
#define BINV_NEWTON_THRESHOLD 166
#define REDC_1_TO_REDC_N_THRESHOLD 32
-#define MU_DIV_QR_THRESHOLD 720
-#define MU_DIVAPPR_Q_THRESHOLD 734
-#define MUPI_DIV_QR_THRESHOLD 67
+#define MU_DIV_QR_THRESHOLD 734
+#define MU_DIVAPPR_Q_THRESHOLD 748
+#define MUPI_DIV_QR_THRESHOLD 68
#define MU_BDIV_QR_THRESHOLD 562
#define MU_BDIV_Q_THRESHOLD 734
-#define MATRIX22_STRASSEN_THRESHOLD 11
-#define HGCD_THRESHOLD 53
+#define MATRIX22_STRASSEN_THRESHOLD 9
+#define HGCD_THRESHOLD 66
+#define HGCD_APPR_THRESHOLD 47
+#define HGCD_REDUCE_THRESHOLD 834
#define GCD_DC_THRESHOLD 183
-#define GCDEXT_DC_THRESHOLD 144
+#define GCDEXT_DC_THRESHOLD 142
#define JACOBI_BASE_METHOD 3
#define GET_STR_DC_THRESHOLD 20
-#define GET_STR_PRECOMPUTE_THRESHOLD 39
+#define GET_STR_PRECOMPUTE_THRESHOLD 36
#define SET_STR_DC_THRESHOLD 458
-#define SET_STR_PRECOMPUTE_THRESHOLD 964
+#define SET_STR_PRECOMPUTE_THRESHOLD 963
diff --git a/mpn/x86/atom/gmp-mparam.h b/mpn/x86/atom/gmp-mparam.h
index 8c2595230..391a0ac4a 100644
--- a/mpn/x86/atom/gmp-mparam.h
+++ b/mpn/x86/atom/gmp-mparam.h
@@ -24,26 +24,27 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* Generated by tuneup.c */
#define MOD_1_NORM_THRESHOLD 3
-#define MOD_1_UNNORM_THRESHOLD 6
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 33
#define MUL_TOOM22_THRESHOLD 20
#define MUL_TOOM33_THRESHOLD 78
-#define MUL_TOOM44_THRESHOLD 184
+#define MUL_TOOM44_THRESHOLD 168
#define MUL_TOOM6H_THRESHOLD 270
#define MUL_TOOM8H_THRESHOLD 406
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 79
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 127
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 34
@@ -52,8 +53,12 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 303
#define SQR_TOOM8_THRESHOLD 547
-#define MULMOD_BNM1_THRESHOLD 14
-#define SQRMOD_BNM1_THRESHOLD 18
+#define MULMID_TOOM42_THRESHOLD 54
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 2,35,262,1168
#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -108,9 +113,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 82
#define SQR_FFT_THRESHOLD 2752
-#define MULLO_BASECASE_THRESHOLD 4
+#define MULLO_BASECASE_THRESHOLD 5
#define MULLO_DC_THRESHOLD 51
-#define MULLO_MUL_N_THRESHOLD 8907
+#define MULLO_MUL_N_THRESHOLD 6633
#define DC_DIV_QR_THRESHOLD 63
#define DC_DIVAPPR_Q_THRESHOLD 252
@@ -131,12 +136,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 1334
#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 126
-#define GCD_DC_THRESHOLD 483
-#define GCDEXT_DC_THRESHOLD 351
+#define HGCD_THRESHOLD 129
+#define HGCD_APPR_THRESHOLD 163
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 469
+#define GCDEXT_DC_THRESHOLD 348
#define JACOBI_BASE_METHOD 3
#define GET_STR_DC_THRESHOLD 13
#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 272
-#define SET_STR_PRECOMPUTE_THRESHOLD 1116
+#define SET_STR_DC_THRESHOLD 262
+#define SET_STR_PRECOMPUTE_THRESHOLD 902
diff --git a/mpn/x86/atom/lshift.asm b/mpn/x86/atom/lshift.asm
index d8cb8b505..1005cce59 100644
--- a/mpn/x86/atom/lshift.asm
+++ b/mpn/x86/atom/lshift.asm
@@ -160,7 +160,7 @@ deflit(`FRAME',4)
shr $2, %eax C (size + 3) / 4
and $3, %edx C (size - 1) % 4
jz L(goloop) C jmp if size == 1 (mod 4)
- shr %edx
+ shr %edx
jnc L(odd) C jum if size == 3 (mod 4)
add %ecx, %ecx
@@ -173,7 +173,7 @@ deflit(`FRAME',4)
jnz L(goloop) C jump if size == 0 (mod 4)
L(odd): lea -8(up), up
lea -8(rp), rp
- jmp L(sentry) C reached if size == 2 or 3 (mod 4)
+ jmp L(sentry) C reached if size == 2 or 3 (mod 4)
L(sloop):
adc %ecx, %ecx
diff --git a/mpn/x86/atom/sse2/mul_1.asm b/mpn/x86/atom/sse2/mul_1.asm
index dd9b95366..5cd86caec 100644
--- a/mpn/x86/atom/sse2/mul_1.asm
+++ b/mpn/x86/atom/sse2/mul_1.asm
@@ -62,7 +62,7 @@ EPILOGUE()
PROLOGUE(mpn_mul_1)
pxor %mm6, %mm6
L(ent): push %esi FRAME_pushl()
- mov PARAM_SRC, up
+ mov PARAM_SRC, up
mov PARAM_SIZE, %eax C size
movd PARAM_MUL, %mm7
movd (up), %mm0
diff --git a/mpn/x86/bdiv_dbm1c.asm b/mpn/x86/bdiv_dbm1c.asm
index 201ef173d..ac9faf270 100644
--- a/mpn/x86/bdiv_dbm1c.asm
+++ b/mpn/x86/bdiv_dbm1c.asm
@@ -24,10 +24,10 @@ C P5
C P6 model 0-8,10-12)
C P6 model 9 (Banias)
C P6 model 13 (Dothan) 5.1
-C P4 model 0 (Willamette)
+C P4 model 0 (Willamette)
C P4 model 1 (?)
C P4 model 2 (Northwood) 13.67
-C P4 model 3 (Prescott)
+C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
C Intel Atom
C AMD K6
diff --git a/mpn/x86/bdiv_q_1.asm b/mpn/x86/bdiv_q_1.asm
index 2528d01f7..7f344ab57 100644
--- a/mpn/x86/bdiv_q_1.asm
+++ b/mpn/x86/bdiv_q_1.asm
@@ -30,7 +30,7 @@ C K6 14.0
C K7 12.0
C P4 42.0
-MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
defframe(PARAM_SHIFT, 24)
defframe(PARAM_INVERSE,20)
diff --git a/mpn/x86/bobcat/gmp-mparam.h b/mpn/x86/bobcat/gmp-mparam.h
new file mode 100644
index 000000000..e14ba39f5
--- /dev/null
+++ b/mpn/x86/bobcat/gmp-mparam.h
@@ -0,0 +1,142 @@
+/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 23
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 42
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 90
+#define MUL_TOOM44_THRESHOLD 147
+#define MUL_TOOM6H_THRESHOLD 274
+#define MUL_TOOM8H_THRESHOLD 454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 38
+#define SQR_TOOM3_THRESHOLD 89
+#define SQR_TOOM4_THRESHOLD 220
+#define SQR_TOOM6_THRESHOLD 303
+#define SQR_TOOM8_THRESHOLD 454
+
+#define MULMID_TOOM42_THRESHOLD 76
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define POWM_SEC_TABLE 4,14,290,357,2178
+
+#define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */
+#define MUL_FFT_TABLE3 \
+ { { 888, 6}, { 25, 7}, { 13, 6}, { 27, 7}, \
+ { 15, 6}, { 33, 7}, { 17, 6}, { 35, 7}, \
+ { 19, 6}, { 39, 7}, { 23, 6}, { 47, 7}, \
+ { 27, 8}, { 15, 7}, { 31, 6}, { 63, 7}, \
+ { 35, 8}, { 19, 7}, { 41, 8}, { 23, 7}, \
+ { 49, 8}, { 31, 7}, { 63, 8}, { 39, 7}, \
+ { 79, 8}, { 43, 9}, { 23, 8}, { 51, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \
+ { 159,10}, { 319, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,12}, { 4096,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 70
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 723 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 723, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
+ { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 47, 7}, { 95, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
+ { 95, 9}, { 191,11}, { 63,10}, { 127, 9}, \
+ { 255,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 543,11}, \
+ { 159, 9}, { 671,11}, { 191,10}, { 383, 9}, \
+ { 799,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 69
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_DC_THRESHOLD 45
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 75
+#define DC_DIVAPPR_Q_THRESHOLD 216
+#define DC_BDIV_QR_THRESHOLD 67
+#define DC_BDIV_Q_THRESHOLD 143
+
+#define INV_MULMOD_BNM1_THRESHOLD 75
+#define INV_NEWTON_THRESHOLD 244
+#define INV_APPR_THRESHOLD 228
+
+#define BINV_NEWTON_THRESHOLD 276
+#define REDC_1_TO_REDC_N_THRESHOLD 71
+
+#define MU_DIV_QR_THRESHOLD 1858
+#define MU_DIVAPPR_Q_THRESHOLD 1822
+#define MUPI_DIV_QR_THRESHOLD 122
+#define MU_BDIV_QR_THRESHOLD 1787
+#define MU_BDIV_Q_THRESHOLD 1787
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD_THRESHOLD 78
+#define HGCD_APPR_THRESHOLD 55
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 474
+#define GCDEXT_DC_THRESHOLD 345
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 31
+#define SET_STR_DC_THRESHOLD 270
+#define SET_STR_PRECOMPUTE_THRESHOLD 812
diff --git a/mpn/x86/core2/gmp-mparam.h b/mpn/x86/core2/gmp-mparam.h
new file mode 100644
index 000000000..feb0f281f
--- /dev/null
+++ b/mpn/x86/core2/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* x86/core2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 19
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 93
+#define MUL_TOOM44_THRESHOLD 228
+#define MUL_TOOM6H_THRESHOLD 294
+#define MUL_TOOM8H_THRESHOLD 458
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 90
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 34
+#define SQR_TOOM3_THRESHOLD 116
+#define SQR_TOOM4_THRESHOLD 178
+#define SQR_TOOM6_THRESHOLD 262
+#define SQR_TOOM8_THRESHOLD 597
+
+#define MULMID_TOOM42_THRESHOLD 70
+
+#define MULMOD_BNM1_THRESHOLD 20
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define POWM_SEC_TABLE 6,26,262,991,2212
+
+#define MUL_FFT_MODF_THRESHOLD 690 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 690, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 15, 5}, { 31, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
+ { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \
+ { 319, 9}, { 639,11}, { 191,10}, { 383, 9}, \
+ { 799,11}, { 223,12}, { 4096,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 70
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 630 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 630, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
+ { 15, 5}, { 31, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 127,10}, { 79, 9}, { 159,10}, { 95,11}, \
+ { 63,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 271, 9}, { 543,11}, \
+ { 159,10}, { 319, 9}, { 671, 8}, { 1343,11}, \
+ { 191,10}, { 383, 9}, { 799,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 67
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 30
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 15
+#define DC_DIVAPPR_Q_THRESHOLD 49
+#define DC_BDIV_QR_THRESHOLD 76
+#define DC_BDIV_Q_THRESHOLD 190
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 35
+#define INV_APPR_THRESHOLD 35
+
+#define BINV_NEWTON_THRESHOLD 324
+#define REDC_1_TO_REDC_N_THRESHOLD 83
+
+#define MU_DIV_QR_THRESHOLD 1442
+#define MU_DIVAPPR_Q_THRESHOLD 1099
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 1589
+#define MU_BDIV_Q_THRESHOLD 1718
+
+#define MATRIX22_STRASSEN_THRESHOLD 31
+#define HGCD_THRESHOLD 118
+#define HGCD_APPR_THRESHOLD 149
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 351
+#define GCDEXT_DC_THRESHOLD 309
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 517
+#define SET_STR_PRECOMPUTE_THRESHOLD 1402
diff --git a/mpn/x86/coreinhm/gmp-mparam.h b/mpn/x86/coreinhm/gmp-mparam.h
new file mode 100644
index 000000000..21afeb619
--- /dev/null
+++ b/mpn/x86/coreinhm/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* x86/coreinhm gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.5 */
+
+#define MOD_1_NORM_THRESHOLD 24
+#define MOD_1_UNNORM_THRESHOLD 15
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 16
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 214
+#define MUL_TOOM6H_THRESHOLD 306
+#define MUL_TOOM8H_THRESHOLD 454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 148
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 131
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 42
+#define SQR_TOOM3_THRESHOLD 149
+#define SQR_TOOM4_THRESHOLD 226
+#define SQR_TOOM6_THRESHOLD 333
+#define SQR_TOOM8_THRESHOLD 494
+
+#define MULMID_TOOM42_THRESHOLD 78
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 21
+
+#define POWM_SEC_TABLE 2,33,294,1298,2870
+
+#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 606, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \
+ { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 37, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
+ { 95, 9}, { 191,11}, { 63,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 63
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 505 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 505, 5}, { 28, 6}, { 15, 5}, { 33, 6}, \
+ { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 7}, { 55, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415,12}, { 4096,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 74
+#define SQR_FFT_THRESHOLD 4800
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 35
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 21
+#define DC_DIVAPPR_Q_THRESHOLD 42
+#define DC_BDIV_QR_THRESHOLD 84
+#define DC_BDIV_Q_THRESHOLD 156
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 17
+#define INV_APPR_THRESHOLD 17
+
+#define BINV_NEWTON_THRESHOLD 348
+#define REDC_1_TO_REDC_N_THRESHOLD 83
+
+#define MU_DIV_QR_THRESHOLD 979
+#define MU_DIVAPPR_Q_THRESHOLD 501
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 1589
+#define MU_BDIV_Q_THRESHOLD 1787
+
+#define MATRIX22_STRASSEN_THRESHOLD 20
+#define HGCD_THRESHOLD 57
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 253
+#define GCDEXT_DC_THRESHOLD 233
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 127
+#define SET_STR_PRECOMPUTE_THRESHOLD 646
diff --git a/mpn/x86/coreisbr/gmp-mparam.h b/mpn/x86/coreisbr/gmp-mparam.h
new file mode 100644
index 000000000..16ef958ad
--- /dev/null
+++ b/mpn/x86/coreisbr/gmp-mparam.h
@@ -0,0 +1,140 @@
+/* x86/coreisbr gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-24, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 24
+#define MOD_1_UNNORM_THRESHOLD 25
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 3
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 18
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 101
+#define MUL_TOOM44_THRESHOLD 244
+#define MUL_TOOM6H_THRESHOLD 351
+#define MUL_TOOM8H_THRESHOLD 547
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 109
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 183
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 109
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 109
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 48
+#define SQR_TOOM3_THRESHOLD 165
+#define SQR_TOOM4_THRESHOLD 276
+#define SQR_TOOM6_THRESHOLD 366
+#define SQR_TOOM8_THRESHOLD 572
+
+#define MULMID_TOOM42_THRESHOLD 98
+
+#define MULMOD_BNM1_THRESHOLD 20
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define POWM_SEC_TABLE 2,27,258,1052
+
+#define MUL_FFT_MODF_THRESHOLD 716 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 716, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \
+ { 27, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 31, 6}, \
+ { 63, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 51, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
+ { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271,11}, { 159,10}, { 319, 9}, \
+ { 639,11}, { 191,10}, { 383, 9}, { 767,11}, \
+ { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 69
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 595 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 595, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
+ { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95,11}, { 63,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \
+ { 159,10}, { 319, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 63
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 100
+#define MULLO_MUL_N_THRESHOLD 14379
+
+#define DC_DIV_QR_THRESHOLD 22
+#define DC_DIVAPPR_Q_THRESHOLD 30
+#define DC_BDIV_QR_THRESHOLD 120
+#define DC_BDIV_Q_THRESHOLD 268
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 12
+#define INV_APPR_THRESHOLD 13
+
+#define BINV_NEWTON_THRESHOLD 410
+#define REDC_1_TO_REDC_N_THRESHOLD 100
+
+#define MU_DIV_QR_THRESHOLD 1037
+#define MU_DIVAPPR_Q_THRESHOLD 889
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 1858
+#define MU_BDIV_Q_THRESHOLD 2172
+
+#define MATRIX22_STRASSEN_THRESHOLD 21
+#define HGCD_THRESHOLD 59
+#define HGCD_APPR_THRESHOLD 56
+#define HGCD_REDUCE_THRESHOLD 4818
+#define GCD_DC_THRESHOLD 278
+#define GCDEXT_DC_THRESHOLD 298
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 23
+#define SET_STR_DC_THRESHOLD 438
+#define SET_STR_PRECOMPUTE_THRESHOLD 1206
diff --git a/mpn/x86/k10/gmp-mparam.h b/mpn/x86/k10/gmp-mparam.h
new file mode 100644
index 000000000..5c036223c
--- /dev/null
+++ b/mpn/x86/k10/gmp-mparam.h
@@ -0,0 +1,142 @@
+/* x86/k10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 12
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 32
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 77
+#define MUL_TOOM44_THRESHOLD 127
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 77
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 90
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 336
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 54
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define POWM_SEC_TABLE 4,32,164,879,2178
+
+#define MUL_FFT_MODF_THRESHOLD 786 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 786, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 6}, { 63, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 63, 9}, { 39, 8}, { 83, 9}, { 47,10}, \
+ { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \
+ { 255, 7}, { 1023, 8}, { 543, 9}, { 279,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543, 8}, { 1087,10}, { 287,11}, { 159, 9}, \
+ { 671,11}, { 191,10}, { 399, 9}, { 799,12}, \
+ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 76
+#define MUL_FFT_THRESHOLD 7424
+
+#define SQR_FFT_MODF_THRESHOLD 660 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 660, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
+ { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 31, 7}, { 63, 8}, \
+ { 35, 7}, { 71, 8}, { 39, 9}, { 23, 8}, \
+ { 55,10}, { 15, 9}, { 31, 8}, { 63, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 167,10}, { 95,11}, { 63,10}, \
+ { 159,11}, { 95, 8}, { 799,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 543,11}, { 159, 9}, \
+ { 639,10}, { 367,11}, { 191,10}, { 383, 9}, \
+ { 799,10}, { 415,11}, { 223,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 67
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 6
+#define MULLO_DC_THRESHOLD 42
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 56
+#define DC_DIVAPPR_Q_THRESHOLD 248
+#define DC_BDIV_QR_THRESHOLD 55
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 250
+#define INV_APPR_THRESHOLD 250
+
+#define BINV_NEWTON_THRESHOLD 276
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 1718
+#define MU_DIVAPPR_Q_THRESHOLD 1652
+#define MUPI_DIV_QR_THRESHOLD 114
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1589
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 131
+#define HGCD_APPR_THRESHOLD 163
+#define HGCD_REDUCE_THRESHOLD 3810
+#define GCD_DC_THRESHOLD 555
+#define GCDEXT_DC_THRESHOLD 389
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 140
+#define SET_STR_PRECOMPUTE_THRESHOLD 1334
diff --git a/mpn/x86/k7/addlsh1_n.asm b/mpn/x86/k7/addlsh1_n.asm
index e5163b676..05df4a740 100644
--- a/mpn/x86/k7/addlsh1_n.asm
+++ b/mpn/x86/k7/addlsh1_n.asm
@@ -44,14 +44,14 @@ C AMD K8
C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately,
C that means we need an initial magic multiply.
-C
+C
C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We
C cannot do rsblsh1_n since we feed carry from the shift blocks to the
C add/subtract blocks, which is right for addition but reversed for
C subtraction. We could perhaps do sublsh1_n, with some extra move insns,
C without losing any time, since we're not issue limited but carry recurrency
C latency.
-C
+C
C Breaking carry recurrency might be a good idea. We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force is to 2*2-way unrolling.
@@ -120,7 +120,7 @@ ifdef(`CPU_P6',`
L(exact):
incl VAR_COUNT
jz L(end)
-
+
ALIGN(16)
L(top):
ifdef(`CPU_P6',`
diff --git a/mpn/x86/k7/gmp-mparam.h b/mpn/x86/k7/gmp-mparam.h
index 84238c4e0..9cc6798af 100644
--- a/mpn/x86/k7/gmp-mparam.h
+++ b/mpn/x86/k7/gmp-mparam.h
@@ -30,6 +30,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 26
@@ -40,19 +41,23 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MUL_TOOM8H_THRESHOLD 454
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 95
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 101
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 50
-#define SQR_TOOM3_THRESHOLD 87
+#define SQR_TOOM3_THRESHOLD 81
#define SQR_TOOM4_THRESHOLD 148
-#define SQR_TOOM6_THRESHOLD 306
+#define SQR_TOOM6_THRESHOLD 274
#define SQR_TOOM8_THRESHOLD 430
+#define MULMID_TOOM42_THRESHOLD 88
+
#define MULMOD_BNM1_THRESHOLD 18
-#define SQRMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 18
+
+#define POWM_SEC_TABLE 2,17,225,961,1604
#define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */
#define MUL_FFT_TABLE3 \
@@ -155,28 +160,30 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULLO_DC_THRESHOLD 42
#define MULLO_MUL_N_THRESHOLD 13463
-#define DC_DIV_QR_THRESHOLD 89
-#define DC_DIVAPPR_Q_THRESHOLD 315
+#define DC_DIV_QR_THRESHOLD 60
+#define DC_DIVAPPR_Q_THRESHOLD 336
#define DC_BDIV_QR_THRESHOLD 91
-#define DC_BDIV_Q_THRESHOLD 274
+#define DC_BDIV_Q_THRESHOLD 268
#define INV_MULMOD_BNM1_THRESHOLD 66
-#define INV_NEWTON_THRESHOLD 300
-#define INV_APPR_THRESHOLD 303
+#define INV_NEWTON_THRESHOLD 284
+#define INV_APPR_THRESHOLD 284
-#define BINV_NEWTON_THRESHOLD 303
-#define REDC_1_TO_REDC_N_THRESHOLD 95
+#define BINV_NEWTON_THRESHOLD 270
+#define REDC_1_TO_REDC_N_THRESHOLD 87
-#define MU_DIV_QR_THRESHOLD 1858
-#define MU_DIVAPPR_Q_THRESHOLD 1718
-#define MUPI_DIV_QR_THRESHOLD 132
-#define MU_BDIV_QR_THRESHOLD 1387
+#define MU_DIV_QR_THRESHOLD 1752
+#define MU_DIVAPPR_Q_THRESHOLD 1652
+#define MUPI_DIV_QR_THRESHOLD 97
+#define MU_BDIV_QR_THRESHOLD 1470
#define MU_BDIV_Q_THRESHOLD 1470
#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 154
-#define GCD_DC_THRESHOLD 599
-#define GCDEXT_DC_THRESHOLD 443
+#define HGCD_THRESHOLD 173
+#define HGCD_APPR_THRESHOLD 226
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 580
+#define GCDEXT_DC_THRESHOLD 414
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 17
diff --git a/mpn/x86/k7/invert_limb.asm b/mpn/x86/k7/invert_limb.asm
index da6f28397..435fa96d0 100644
--- a/mpn/x86/k7/invert_limb.asm
+++ b/mpn/x86/k7/invert_limb.asm
@@ -60,7 +60,7 @@ ifdef(`DARWIN',`
PROLOGUE(mpn_invert_limb)
deflit(`FRAME', 0)
mov PARAM_DIVISOR, %eax
- C Avoid push/pop on k7.
+ C Avoid push/pop on k7.
sub $8, %esp FRAME_subl_esp(8)
mov %ebx, (%esp)
mov %edi, 4(%esp)
diff --git a/mpn/x86/k7/sublsh1_n.asm b/mpn/x86/k7/sublsh1_n.asm
index 41993f99a..965348586 100644
--- a/mpn/x86/k7/sublsh1_n.asm
+++ b/mpn/x86/k7/sublsh1_n.asm
@@ -30,7 +30,7 @@ C cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9 (Banias)
-C P6 model 13 (Dothan)
+C P6 model 13 (Dothan)
C P4 model 0 (Willamette)
C P4 model 1 (?)
C P4 model 2 (Northwood)
@@ -38,12 +38,12 @@ C P4 model 3 (Prescott)
C P4 model 4 (Nocona)
C Intel Atom 6.75
C AMD K6
-C AMD K7
+C AMD K7
C AMD K8
C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
C processors. It uses 2*4-way unrolling, for good reasons.
-C
+C
C Breaking carry recurrency might be a good idea. We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force is to 2*2-way unrolling.
@@ -114,7 +114,7 @@ ifdef(`CPU_P6',`
adc %ebp, %ebp
rcr %edx C restore 1st saved carry bit
-
+
sbb %eax, (rp)
sbb %ebx, 4(rp)
sbb %ecx, 8(rp)
diff --git a/mpn/x86/k8/gmp-mparam.h b/mpn/x86/k8/gmp-mparam.h
new file mode 100644
index 000000000..727a381f1
--- /dev/null
+++ b/mpn/x86/k8/gmp-mparam.h
@@ -0,0 +1,144 @@
+/* x86/k8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 42
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 136
+#define MUL_TOOM6H_THRESHOLD 286
+#define MUL_TOOM8H_THRESHOLD 430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 46
+#define SQR_TOOM3_THRESHOLD 77
+#define SQR_TOOM4_THRESHOLD 202
+#define SQR_TOOM6_THRESHOLD 294
+#define SQR_TOOM8_THRESHOLD 430
+
+#define MULMID_TOOM42_THRESHOLD 74
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 2,14,216,991,2658
+
+#define MUL_FFT_MODF_THRESHOLD 888 /* k = 6 */
+#define MUL_FFT_TABLE3 \
+ { { 888, 6}, { 15, 5}, { 31, 6}, { 25, 7}, \
+ { 13, 6}, { 27, 7}, { 15, 6}, { 33, 7}, \
+ { 17, 6}, { 35, 7}, { 19, 6}, { 39, 7}, \
+ { 23, 6}, { 47, 7}, { 27, 8}, { 15, 7}, \
+ { 31, 6}, { 63, 7}, { 35, 8}, { 19, 7}, \
+ { 41, 8}, { 23, 7}, { 47, 8}, { 31, 7}, \
+ { 63, 8}, { 39, 7}, { 79, 9}, { 23, 8}, \
+ { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \
+ { 255,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \
+ { 287,11}, { 159,10}, { 335,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399, 9}, { 799,11}, \
+ { 223,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 73
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 758 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 758, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 63, 8}, \
+ { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \
+ { 1087,10}, { 287,11}, { 159,10}, { 319, 9}, \
+ { 671,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 399, 9}, { 799,12}, { 4096,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 74
+#define SQR_FFT_THRESHOLD 7296
+
+#define MULLO_BASECASE_THRESHOLD 8
+#define MULLO_DC_THRESHOLD 35
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 91
+#define DC_DIVAPPR_Q_THRESHOLD 278
+#define DC_BDIV_QR_THRESHOLD 87
+#define DC_BDIV_Q_THRESHOLD 216
+
+#define INV_MULMOD_BNM1_THRESHOLD 62
+#define INV_NEWTON_THRESHOLD 262
+#define INV_APPR_THRESHOLD 262
+
+#define BINV_NEWTON_THRESHOLD 278
+#define REDC_1_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1787
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 106
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1589
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD_THRESHOLD 139
+#define HGCD_APPR_THRESHOLD 176
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 610
+#define GCDEXT_DC_THRESHOLD 419
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 29
+#define SET_STR_DC_THRESHOLD 450
+#define SET_STR_PRECOMPUTE_THRESHOLD 1366
diff --git a/mpn/x86/nano/gmp-mparam.h b/mpn/x86/nano/gmp-mparam.h
new file mode 100644
index 000000000..5fa509372
--- /dev/null
+++ b/mpn/x86/nano/gmp-mparam.h
@@ -0,0 +1,152 @@
+/* x86/nano gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_1P_METHOD 1
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 53
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 32
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 132
+#define MUL_TOOM44_THRESHOLD 195
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 130
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 135
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 194
+#define SQR_TOOM4_THRESHOLD 502
+#define SQR_TOOM6_THRESHOLD 746
+#define SQR_TOOM8_THRESHOLD 1005
+
+#define MULMID_TOOM42_THRESHOLD 40
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define POWM_SEC_TABLE 4,23,258,828,2246
+
+#define MUL_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 308, 5}, { 13, 6}, { 7, 5}, { 17, 6}, \
+ { 9, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \
+ { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \
+ { 19, 7}, { 11, 6}, { 24, 7}, { 15, 6}, \
+ { 31, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \
+ { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 9}, { 15, 8}, { 31, 7}, \
+ { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 47,10}, \
+ { 31, 9}, { 71,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 543, 9}, \
+ { 287, 8}, { 575, 7}, { 1215,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 543, 8}, { 1087,10}, { 287, 9}, \
+ { 607, 8}, { 1215,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 351, 9}, { 703, 8}, { 1407, 9}, \
+ { 735, 8}, { 1471,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
+ { 447, 9}, { 895,10}, { 479, 9}, { 959, 8}, \
+ { 1919,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD 1856
+
+#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 396, 5}, { 13, 6}, { 7, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 15, 6}, { 31, 7}, { 19, 6}, \
+ { 39, 7}, { 21, 8}, { 11, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 543,10}, { 143, 9}, \
+ { 287, 8}, { 607, 7}, { 1215, 6}, { 2431,10}, \
+ { 159, 8}, { 639,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
+ { 1087,10}, { 287, 9}, { 607, 8}, { 1215,11}, \
+ { 159,10}, { 319, 9}, { 671,10}, { 351, 9}, \
+ { 703, 8}, { 1407, 9}, { 735, 8}, { 1471, 7}, \
+ { 2943,11}, { 191,10}, { 383, 9}, { 799,10}, \
+ { 415, 9}, { 895,10}, { 479,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 87
+#define SQR_FFT_THRESHOLD 2368
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 51
+#define MULLO_MUL_N_THRESHOLD 3369
+
+#define DC_DIV_QR_THRESHOLD 56
+#define DC_DIVAPPR_Q_THRESHOLD 183
+#define DC_BDIV_QR_THRESHOLD 55
+#define DC_BDIV_Q_THRESHOLD 118
+
+#define INV_MULMOD_BNM1_THRESHOLD 30
+#define INV_NEWTON_THRESHOLD 266
+#define INV_APPR_THRESHOLD 218
+
+#define BINV_NEWTON_THRESHOLD 268
+#define REDC_1_TO_REDC_N_THRESHOLD 56
+
+#define MU_DIV_QR_THRESHOLD 1308
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 124
+#define MU_BDIV_QR_THRESHOLD 855
+#define MU_BDIV_Q_THRESHOLD 1334
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD_THRESHOLD 104
+#define HGCD_APPR_THRESHOLD 139
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 456
+#define GCDEXT_DC_THRESHOLD 321
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 542
+#define SET_STR_PRECOMPUTE_THRESHOLD 840
diff --git a/mpn/x86/p6/bdiv_q_1.asm b/mpn/x86/p6/bdiv_q_1.asm
index 3a8733a0d..0ffbc78e4 100644
--- a/mpn/x86/p6/bdiv_q_1.asm
+++ b/mpn/x86/p6/bdiv_q_1.asm
@@ -25,7 +25,7 @@ include(`../config.m4')
C odd even divisor
C P6: 10.0 12.0 cycles/limb
-C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
C The odd case is basically the same as mpn_modexact_1_odd, just with an
C extra store, and it runs at the same 10 cycles which is the dependent
@@ -269,7 +269,7 @@ ifdef(`PIC',`
imull %edx, %eax C inv*inv*d
subl %eax, %ebp C inv = 2*inv - inv*inv*d
-
+
jmp L(common)
EPILOGUE()
diff --git a/mpn/x86/p6/sse2/gmp-mparam.h b/mpn/x86/p6/sse2/gmp-mparam.h
index 2735b9c63..b163c58cc 100644
--- a/mpn/x86/p6/sse2/gmp-mparam.h
+++ b/mpn/x86/p6/sse2/gmp-mparam.h
@@ -31,37 +31,42 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
/* 1867 MHz P6 model 13 */
#define MOD_1_NORM_THRESHOLD 4
-#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 4
#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 22
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
#define MUL_TOOM22_THRESHOLD 20
-#define MUL_TOOM33_THRESHOLD 77
-#define MUL_TOOM44_THRESHOLD 182
+#define MUL_TOOM33_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 181
#define MUL_TOOM6H_THRESHOLD 252
-#define MUL_TOOM8H_THRESHOLD 381
+#define MUL_TOOM8H_THRESHOLD 363
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 79
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 30
#define SQR_TOOM3_THRESHOLD 101
#define SQR_TOOM4_THRESHOLD 154
#define SQR_TOOM6_THRESHOLD 222
-#define SQR_TOOM8_THRESHOLD 547
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 58
#define MULMOD_BNM1_THRESHOLD 13
#define SQRMOD_BNM1_THRESHOLD 17
+#define POWM_SEC_TABLE 4,23,258,768,2388
+
#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
@@ -143,34 +148,36 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 5760
#define MULLO_BASECASE_THRESHOLD 0 /* always */
-#define MULLO_DC_THRESHOLD 34
+#define MULLO_DC_THRESHOLD 33
#define MULLO_MUL_N_THRESHOLD 13463
-#define DC_DIV_QR_THRESHOLD 22
+#define DC_DIV_QR_THRESHOLD 20
#define DC_DIVAPPR_Q_THRESHOLD 56
#define DC_BDIV_QR_THRESHOLD 60
-#define DC_BDIV_Q_THRESHOLD 132
+#define DC_BDIV_Q_THRESHOLD 134
#define INV_MULMOD_BNM1_THRESHOLD 38
-#define INV_NEWTON_THRESHOLD 71
+#define INV_NEWTON_THRESHOLD 66
#define INV_APPR_THRESHOLD 63
-#define BINV_NEWTON_THRESHOLD 252
-#define REDC_1_TO_REDC_N_THRESHOLD 62
+#define BINV_NEWTON_THRESHOLD 250
+#define REDC_1_TO_REDC_N_THRESHOLD 63
-#define MU_DIV_QR_THRESHOLD 1142
-#define MU_DIVAPPR_Q_THRESHOLD 889
-#define MUPI_DIV_QR_THRESHOLD 39
-#define MU_BDIV_QR_THRESHOLD 1308
-#define MU_BDIV_Q_THRESHOLD 1442
+#define MU_DIV_QR_THRESHOLD 1164
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 38
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1470
#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 61
-#define GCD_DC_THRESHOLD 379
-#define GCDEXT_DC_THRESHOLD 298
-#define JACOBI_BASE_METHOD 4
+#define HGCD_THRESHOLD 64
+#define HGCD_APPR_THRESHOLD 105
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 386
+#define GCDEXT_DC_THRESHOLD 309
+#define JACOBI_BASE_METHOD 1
#define GET_STR_DC_THRESHOLD 13
-#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 582
-#define SET_STR_PRECOMPUTE_THRESHOLD 1055
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 587
+#define SET_STR_PRECOMPUTE_THRESHOLD 1104
diff --git a/mpn/x86/pentium/bdiv_q_1.asm b/mpn/x86/pentium/bdiv_q_1.asm
index 965173d1c..7e84fc817 100644
--- a/mpn/x86/pentium/bdiv_q_1.asm
+++ b/mpn/x86/pentium/bdiv_q_1.asm
@@ -27,7 +27,7 @@ C odd even
C P54: 24.5 30.5 cycles/limb
C P55: 23.0 28.0
-MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
C expected. On P54 in the even case the shrdl pairing nonsense (see
diff --git a/mpn/x86/pentium4/sse2/gmp-mparam.h b/mpn/x86/pentium4/sse2/gmp-mparam.h
index b1e56b5e2..8a198ad96 100644
--- a/mpn/x86/pentium4/sse2/gmp-mparam.h
+++ b/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -22,37 +22,42 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define BYTES_PER_MP_LIMB 4
-#define MOD_1_NORM_THRESHOLD 9
-#define MOD_1_UNNORM_THRESHOLD 20
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 20
#define MUL_TOOM22_THRESHOLD 31
-#define MUL_TOOM33_THRESHOLD 120
-#define MUL_TOOM44_THRESHOLD 286
+#define MUL_TOOM33_THRESHOLD 114
+#define MUL_TOOM44_THRESHOLD 300
#define MUL_TOOM6H_THRESHOLD 426
-#define MUL_TOOM8H_THRESHOLD 592
+#define MUL_TOOM8H_THRESHOLD 620
-#define MUL_TOOM32_TO_TOOM43_THRESHOLD 195
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 216
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 193
-#define MUL_TOOM42_TO_TOOM63_THRESHOLD 187
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 184
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 207
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 181
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 209
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
-#define SQR_TOOM2_THRESHOLD 48
-#define SQR_TOOM3_THRESHOLD 174
-#define SQR_TOOM4_THRESHOLD 390
-#define SQR_TOOM6_THRESHOLD 0
-#define SQR_TOOM8_THRESHOLD 507
+#define SQR_TOOM2_THRESHOLD 49
+#define SQR_TOOM3_THRESHOLD 173
+#define SQR_TOOM4_THRESHOLD 264
+#define SQR_TOOM6_THRESHOLD 354
+#define SQR_TOOM8_THRESHOLD 810
-#define MULMOD_BNM1_THRESHOLD 17
-#define SQRMOD_BNM1_THRESHOLD 21
+#define MULMID_TOOM42_THRESHOLD 68
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define POWM_SEC_TABLE 2,33,246,1052,2178
#define MUL_FFT_MODF_THRESHOLD 904 /* k = 6 */
#define MUL_FFT_TABLE3 \
@@ -102,35 +107,37 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 72
#define SQR_FFT_THRESHOLD 6784
-#define MULLO_BASECASE_THRESHOLD 12
-#define MULLO_DC_THRESHOLD 49
-#define MULLO_MUL_N_THRESHOLD 13866
+#define MULLO_BASECASE_THRESHOLD 13
+#define MULLO_DC_THRESHOLD 52
+#define MULLO_MUL_N_THRESHOLD 13463
-#define DC_DIV_QR_THRESHOLD 37
-#define DC_DIVAPPR_Q_THRESHOLD 81
-#define DC_BDIV_QR_THRESHOLD 51
-#define DC_BDIV_Q_THRESHOLD 80
+#define DC_DIV_QR_THRESHOLD 39
+#define DC_DIVAPPR_Q_THRESHOLD 77
+#define DC_BDIV_QR_THRESHOLD 54
+#define DC_BDIV_Q_THRESHOLD 94
#define INV_MULMOD_BNM1_THRESHOLD 60
-#define INV_NEWTON_THRESHOLD 244
-#define INV_APPR_THRESHOLD 98
+#define INV_NEWTON_THRESHOLD 182
+#define INV_APPR_THRESHOLD 93
-#define BINV_NEWTON_THRESHOLD 276
-#define REDC_1_TO_REDC_N_THRESHOLD 63
+#define BINV_NEWTON_THRESHOLD 296
+#define REDC_1_TO_REDC_N_THRESHOLD 66
#define MU_DIV_QR_THRESHOLD 2350
-#define MU_DIVAPPR_Q_THRESHOLD 2172
-#define MUPI_DIV_QR_THRESHOLD 48
-#define MU_BDIV_QR_THRESHOLD 1858
-#define MU_BDIV_Q_THRESHOLD 2172
-
-#define MATRIX22_STRASSEN_THRESHOLD 29
-#define HGCD_THRESHOLD 81
-#define GCD_DC_THRESHOLD 309
+#define MU_DIVAPPR_Q_THRESHOLD 2130
+#define MUPI_DIV_QR_THRESHOLD 71
+#define MU_BDIV_QR_THRESHOLD 2130
+#define MU_BDIV_Q_THRESHOLD 2130
+
+#define MATRIX22_STRASSEN_THRESHOLD 24
+#define HGCD_THRESHOLD 77
+#define HGCD_APPR_THRESHOLD 91
+#define HGCD_REDUCE_THRESHOLD 5010
+#define GCD_DC_THRESHOLD 327
#define GCDEXT_DC_THRESHOLD 253
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 10
-#define GET_STR_PRECOMPUTE_THRESHOLD 25
-#define SET_STR_DC_THRESHOLD 118
-#define SET_STR_PRECOMPUTE_THRESHOLD 1099
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 144
+#define SET_STR_PRECOMPUTE_THRESHOLD 979
diff --git a/mpn/x86/tabselect.asm b/mpn/x86/tabselect.asm
new file mode 100644
index 000000000..7c8c2601f
--- /dev/null
+++ b/mpn/x86/tabselect.asm
@@ -0,0 +1,104 @@
+dnl x86 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) 4.5
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C Intel Atom ?
+C AMD K6 ?
+C AMD K7 3.4
+C AMD K8 ?
+C AMD K10 ?
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using SSE2 could result in many-fold speedup.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `%edi')
+define(`tp', `%esi')
+define(`n', `%ebx')
+define(`nents', `%ecx')
+define(`which', `36(%esp)')
+
+define(`i', `%ebp')
+define(`maskp', `20(%esp)')
+define(`maskn', `32(%esp)')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+C Copy one n-limb entry out of a table of nents consecutive n-limb entries
+C starting at tp into rp.  The entry whose index equals which is the one that
+C ends up in rp.  Every entry is read and every rp limb is rewritten on each
+C pass over the table; and-masks decide which data survives, and there is no
+C branch that depends on which.
+C
+C Stack layout after the four pushes below: return address at 16(%esp), then
+C the arguments rp, tp, n, nents, which at 20, 24, 28, 32, 36(%esp).
+C maskp and maskn reuse the slots of rp and nents, which are held in
+C registers by the time the masks are first stored.
+PROLOGUE(mpn_tabselect)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+ mov 20(%esp), rp
+ mov 24(%esp), tp
+ mov 28(%esp), n
+ mov 32(%esp), nents
+
+C Point rp and tp just past the end of their first n limbs so that the inner
+C loop can index them with a negative counter running up towards zero.
+ lea (rp,n,4), rp
+ lea (tp,n,4), tp
+C Bias the stored which by -nents.  Adding the remaining entry count back in
+C the outer loop then yields which - k, where k is the current entry index.
+ sub nents, which
+L(outer):
+ mov which, %eax
+ add nents, %eax
+ neg %eax C set CF iff 'which' != k
+C eax = -CF: all ones when this entry is not the selected one, else zero.
+ sbb %eax, %eax
+C maskn keeps the current rp limb, maskp lets the table limb through.
+ mov %eax, maskn
+ not %eax
+ mov %eax, maskp
+
+C i counts from -n up to -1.
+ mov n, i
+ neg i
+
+ ALIGN(16)
+C Inner loop: rp[j] = (tp[j] & maskp) | (rp[j] & maskn), one limb per pass.
+L(top): mov (tp,i,4), %eax
+ and maskp, %eax
+ mov (rp,i,4), %edx
+ and maskn, %edx
+ or %edx, %eax
+ mov %eax, (rp,i,4)
+ inc i
+ js L(top)
+
+C Advance tp by n limbs to the end of the next table entry and loop until
+C all nents entries have been visited.
+L(end): mov n, %eax
+ lea (tp,%eax,4), tp
+ dec nents
+ jne L(outer)
+
+C Restore the saved registers in reverse push order.
+L(outer_end):
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
diff --git a/mpn/x86_64/addmul_2.asm b/mpn/x86_64/addmul_2.asm
index 107c3dafe..5c6647888 100644
--- a/mpn/x86_64/addmul_2.asm
+++ b/mpn/x86_64/addmul_2.asm
@@ -50,10 +50,14 @@ define(`w2', `%rbp')
define(`w3', `%r10')
define(`n', `%r11')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+ASM_START()
TEXT
ALIGN(16)
-ASM_START()
PROLOGUE(mpn_addmul_2)
+ DOS64_ENTRY(4)
mov n_param, n
push %rbx
push %rbp
@@ -164,6 +168,7 @@ L(end): xor R32(w1), R32(w1)
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aorrlsh1_n.asm b/mpn/x86_64/aorrlsh1_n.asm
index 2ea556b73..dda7d590e 100644
--- a/mpn/x86_64/aorrlsh1_n.asm
+++ b/mpn/x86_64/aorrlsh1_n.asm
@@ -1,7 +1,8 @@
dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
-dnl Copyright 2003, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2006, 2007, 2008, 2009, 2011 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -54,10 +55,14 @@ ifdef(`OPERATION_rsblsh1_n', `
MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
push %rbp
mov (vp), %r8
@@ -147,5 +152,6 @@ ifdef(`OPERATION_rsblsh1_n',`
movslq R32(%rbp), %rax')
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aorrlsh2_n.asm b/mpn/x86_64/aorrlsh2_n.asm
index 6d55cfd10..8c427a674 100644
--- a/mpn/x86_64/aorrlsh2_n.asm
+++ b/mpn/x86_64/aorrlsh2_n.asm
@@ -3,7 +3,7 @@ dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh2_n',`
MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/mpn/x86_64/aorrlshC_n.asm b/mpn/x86_64/aorrlshC_n.asm
index cab0b07f4..ae9a9d952 100644
--- a/mpn/x86_64/aorrlshC_n.asm
+++ b/mpn/x86_64/aorrlshC_n.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -37,10 +37,14 @@ define(`n', `%rcx')
define(M, eval(m4_lshift(1,LSH)))
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
push %r12
push %r13
push %r14
@@ -140,5 +144,6 @@ ifelse(ADDSUB,add,`
pop %r14
pop %r13
pop %r12
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aorrlsh_n.asm b/mpn/x86_64/aorrlsh_n.asm
index d19dea535..8ab3688d2 100644
--- a/mpn/x86_64/aorrlsh_n.asm
+++ b/mpn/x86_64/aorrlsh_n.asm
@@ -56,10 +56,23 @@ ifdef(`OPERATION_rsblsh_n',`
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
push %r12
push %r13
push %r14
@@ -155,5 +168,6 @@ L(end): add R32(%rbx), R32(%rbx)
pop %r14
pop %r13
pop %r12
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aors_n.asm b/mpn/x86_64/aors_n.asm
index 916e9b664..eadde641b 100644
--- a/mpn/x86_64/aors_n.asm
+++ b/mpn/x86_64/aors_n.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_add_n, mpn_sub_n
-dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010 Free Software Foundation,
-dnl Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -30,15 +30,15 @@ C Intel SBR 1.59
C Intel atom 4
C VIA nano 3.25
-C The inner loop of this code is the result of running a code generation and
+C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-define(`cy', `%r8') C (only for mpn_add_nc)
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (only for mpn_add_nc)
ifdef(`OPERATION_add_n', `
define(ADCSBB, adc)
@@ -51,10 +51,23 @@ ifdef(`OPERATION_sub_n', `
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
mov R32(n), R32(%rax)
shr $2, n
and $3, R32(%rax)
@@ -69,6 +82,7 @@ PROLOGUE(func_nc)
EPILOGUE()
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
mov R32(n), R32(%rax)
shr $2, n
and $3, R32(%rax)
@@ -85,6 +99,7 @@ L(lt4): dec R32(%rax)
ADCSBB (vp), %r8
mov %r8, (rp)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(2): dec R32(%rax)
@@ -95,6 +110,7 @@ L(2): dec R32(%rax)
mov %r8, (rp)
mov %r9, 8(rp)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(3): mov 16(up), %r10
@@ -105,6 +121,7 @@ L(3): mov 16(up), %r10
mov %r9, 8(rp)
mov %r10, 16(rp)
setc R8(%rax)
+ DOS64_EXIT()
ret
ALIGN(16)
@@ -142,5 +159,6 @@ L(end): lea 32(up), up
dec R32(%rax)
jnz L(lt4)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/aorscnd_n.asm b/mpn/x86_64/aorscnd_n.asm
new file mode 100644
index 000000000..d22a2a218
--- /dev/null
+++ b/mpn/x86_64/aorscnd_n.asm
@@ -0,0 +1,178 @@
+dnl AMD64 mpn_addcnd_n, mpn_subcnd_n
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.25
+C AMD K10 2
+C Intel P4 13
+C Intel core2 2.9
+C Intel NHM 2.9
+C Intel SBR 2.4
+C Intel atom 6.5
+C VIA nano 3
+
+C NOTES
+C * It might seem natural to use the cmov insn here, but since this function
+C is supposed to have the exact same execution pattern for cnd true and
+C false, and since cmov's documentation is not clear about whether it
+C actually reads both source operands and writes the register for a false
+C condition, we cannot use it.
+C * Two cases could be optimised: (1) addcnd_n could use ADCSBB-from-memory
+C to save one insn/limb, and (2) when up=rp addcnd_n and subcnd_n could use
+C ADCSBB-to-memory, again saving 1 insn/limb.
+C * This runs optimally at decoder bandwidth on K10. It has not been tuned
+C for any other processor.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnd', `%r8')
+
+ifdef(`OPERATION_addcnd_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addcnd_n)')
+ifdef(`OPERATION_subcnd_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_subcnd_n)')
+
+MULFUNC_PROLOGUE(mpn_addcnd_n mpn_subcnd_n)
+
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+C Conditional add or subtract: rp[] = up[] ADDSUB (vp[] & mask) over n limbs.
+C mask is all ones when cnd is nonzero and zero otherwise, so the same
+C instruction sequence runs for either value of cnd.  The carry or borrow out
+C of the top limb is returned in rax as 0 or 1.
+C
+C Between blocks the carry is kept in eax as 0 or -1, because the and
+C instructions and the counter update overwrite the flags.
+C NOTE(review): when n is a multiple of 4 the L(top) loop is entered
+C unconditionally, so n = 0 would still process four limbs; callers
+C presumably guarantee n >= 1 -- confirm.
+PROLOGUE(func)
+C DOS64_ENTRY and DOS64_EXIT are defined outside this file; presumably they
+C map the DOS64 argument registers onto the ELF64 ones used below -- confirm.
+ DOS64_ENTRY(4)
+C Emitted only when HOST_DOS64 is defined: fetch the fifth argument from the
+C stack into r8.
+IFDOS(` mov 56(%rsp), %r8 ')
+C Save the callee-saved registers used as scratch below.
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+C neg sets CF iff cnd is nonzero; sbb then turns that into 0 or all ones.
+ neg cnd
+ sbb cnd, cnd C make cnd mask
+
+C Point all three operands past their last limb; n becomes a negative index
+C that counts up towards zero.
+ lea (vp,n,8), vp
+ lea (up,n,8), up
+ lea (rp,n,8), rp
+
+C Dispatch on n mod 4: handle 1, 2 or 3 leading limbs first so that the main
+C loop always processes whole groups of four.
+ mov R32(n), R32(%rax)
+ neg n
+ and $3, R32(%rax)
+ jz L(top) C carry-save reg rax = 0 in this arc
+ cmp $2, R32(%rax)
+ jc L(b1)
+ jz L(b2)
+
+C n mod 4 = 3: three leading limbs.
+L(b3): mov (vp,n,8), %r12
+ mov 8(vp,n,8), %r13
+ mov 16(vp,n,8), %r14
+ mov (up,n,8), %r10
+ mov 8(up,n,8), %rbx
+ mov 16(up,n,8), %rbp
+ and cnd, %r12
+ and cnd, %r13
+ and cnd, %r14
+ ADDSUB %r12, %r10
+ ADCSBB %r13, %rbx
+ ADCSBB %r14, %rbp
+ sbb R32(%rax), R32(%rax) C save carry
+ mov %r10, (rp,n,8)
+ mov %rbx, 8(rp,n,8)
+ mov %rbp, 16(rp,n,8)
+ add $3, n
+C Index still negative means more limbs remain for the main loop.
+ js L(top)
+ jmp L(end)
+
+C n mod 4 = 2: two leading limbs.
+L(b2): mov (vp,n,8), %r12
+ mov 8(vp,n,8), %r13
+ mov (up,n,8), %r10
+ mov 8(up,n,8), %rbx
+ and cnd, %r12
+ and cnd, %r13
+ ADDSUB %r12, %r10
+ ADCSBB %r13, %rbx
+ sbb R32(%rax), R32(%rax) C save carry
+ mov %r10, (rp,n,8)
+ mov %rbx, 8(rp,n,8)
+ add $2, n
+ js L(top)
+ jmp L(end)
+
+C n mod 4 = 1: one leading limb.  Falls through into the main loop unless the
+C index has reached zero.
+L(b1): mov (vp,n,8), %r12
+ mov (up,n,8), %r10
+ and cnd, %r12
+ ADDSUB %r12, %r10
+ sbb R32(%rax), R32(%rax) C save carry
+ mov %r10, (rp,n,8)
+ add $1, n
+ jns L(end)
+
+ ALIGN(16)
+C Main loop: four limbs per pass.  All loads and masking are done first, then
+C the saved carry is restored and the four-limb ADCSBB chain runs.
+L(top): mov (vp,n,8), %r12
+ mov 8(vp,n,8), %r13
+ mov 16(vp,n,8), %r14
+ mov 24(vp,n,8), %r11
+ mov (up,n,8), %r10
+ mov 8(up,n,8), %rbx
+ mov 16(up,n,8), %rbp
+ mov 24(up,n,8), %r9
+ and cnd, %r12
+ and cnd, %r13
+ and cnd, %r14
+ and cnd, %r11
+C eax is 0 or -1; doubling it sets CF exactly when it was -1.
+ add R32(%rax), R32(%rax) C restore carry
+ ADCSBB %r12, %r10
+ ADCSBB %r13, %rbx
+ ADCSBB %r14, %rbp
+ ADCSBB %r11, %r9
+ sbb R32(%rax), R32(%rax) C save carry
+ mov %r10, (rp,n,8)
+ mov %rbx, 8(rp,n,8)
+ mov %rbp, 16(rp,n,8)
+ mov %r9, 24(rp,n,8)
+ add $4, n
+ js L(top)
+
+C Convert the saved carry from 0 or -1 into the 0 or 1 return value, then
+C restore the saved registers in reverse push order.
+L(end): neg R32(%rax)
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ DOS64_EXIT()
+ ret
+EPILOGUE()
diff --git a/mpn/x86_64/aorsmul_1.asm b/mpn/x86_64/aorsmul_1.asm
index 9c64d56fc..a406bc9e8 100644
--- a/mpn/x86_64/aorsmul_1.asm
+++ b/mpn/x86_64/aorsmul_1.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_addmul_1 and mpn_submul_1.
-dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -28,20 +28,27 @@ C Intel corei ?
C Intel atom 21.3
C VIA nano 5.5
-C The inner loop of this code is the result of running a code generation and
+C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
-C TODO:
-C * The inner loop is great, but the prologue and epilogue code was
-C quickly written. Tune it!
+C TODO
+C * The loop is great, but the prologue and epilogue code was quickly written.
+C Tune it!
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param',`%rdx')
-define(`vl', `%rcx')
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vl', `%rcx') C r9
-define(`n', `%r11')
+define(`n', `%r11')
+
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
ifdef(`OPERATION_addmul_1',`
define(`ADDSUB', `add')
@@ -52,17 +59,33 @@ ifdef(`OPERATION_submul_1',`
define(`func', `mpn_submul_1')
')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`vl', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
mov (up), %rax C read first u limb early
push %rbx
- mov n_param, %rbx C move away n from rdx, mul uses it
+IFELF(` mov n_param, %rbx ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %rbx ')
mul vl
- mov %rbx, n
+IFELF(` mov %rbx, n ')
and $3, R32(%rbx)
jz L(b0)
@@ -145,5 +168,7 @@ L(ret): adc $0, %rdx
mov %rdx, %rax
pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
ret
EPILOGUE()
diff --git a/mpn/x86_64/atom/gmp-mparam.h b/mpn/x86_64/atom/gmp-mparam.h
index 37ddcebc2..380f36f25 100644
--- a/mpn/x86_64/atom/gmp-mparam.h
+++ b/mpn/x86_64/atom/gmp-mparam.h
@@ -31,14 +31,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
-#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
-#define BMOD_1_TO_MOD_1_THRESHOLD 17
+#define BMOD_1_TO_MOD_1_THRESHOLD 16
#define MUL_TOOM22_THRESHOLD 10
#define MUL_TOOM33_THRESHOLD 65
@@ -58,9 +59,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 222
#define SQR_TOOM8_THRESHOLD 333
+#define MULMID_TOOM42_THRESHOLD 14
+
#define MULMOD_BNM1_THRESHOLD 7
#define SQRMOD_BNM1_THRESHOLD 12
+#define POWM_SEC_TABLE 2,31,213,724,2112
+
#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 220, 5}, { 7, 4}, { 15, 5}, { 13, 6}, \
@@ -145,9 +150,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 748
#define MATRIX22_STRASSEN_THRESHOLD 13
-#define HGCD_THRESHOLD 82
+#define HGCD_THRESHOLD 79
+#define HGCD_APPR_THRESHOLD 83
+#define HGCD_REDUCE_THRESHOLD 1137
#define GCD_DC_THRESHOLD 186
-#define GCDEXT_DC_THRESHOLD 186
+#define GCDEXT_DC_THRESHOLD 189
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 15
diff --git a/mpn/x86_64/bdiv_dbm1c.asm b/mpn/x86_64/bdiv_dbm1c.asm
index f6a77507d..0fef478d9 100644
--- a/mpn/x86_64/bdiv_dbm1c.asm
+++ b/mpn/x86_64/bdiv_dbm1c.asm
@@ -41,10 +41,23 @@ define(`cy', `%r8')
define(`n', `%r9')
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_bdiv_dbm1c)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
mov (up), %rax
mov n_param, n
mov R32(n_param), R32(%r11)
@@ -84,6 +97,7 @@ L(lo1): sub %rax, %r8
add $4, n
jnz L(top)
-L(end): mov %r8, %rax
+ mov %r8, %rax
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/bdiv_q_1.asm b/mpn/x86_64/bdiv_q_1.asm
index 01624a52a..e1e1db5a5 100644
--- a/mpn/x86_64/bdiv_q_1.asm
+++ b/mpn/x86_64/bdiv_q_1.asm
@@ -1,8 +1,8 @@
dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
dnl 1-limb divisor, returning quotient only.
-dnl Copyright 2001, 2002, 2004, 2005, 2006, 2009 Free Software Foundation,
-dnl Inc.
+dnl Copyright 2001, 2002, 2004, 2005, 2006, 2009, 2011 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -41,10 +41,22 @@ C di r8 just mpn_pi1_bdiv_q_1
C shift r9 just mpn_pi1_bdiv_q_1
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
+ DOS64_ENTRY(4)
push %rbx
mov %rcx, %rax
@@ -91,6 +103,9 @@ L(evn): bsf %rax, %rcx
EPILOGUE()
PROLOGUE(mpn_pi1_bdiv_q_1)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
push %rbx
mov %rcx, %r11 C d
@@ -144,11 +159,13 @@ L(ent): imul %r8, %rax
imul %r8, %rax
mov %rax, (%rdi)
pop %rbx
+ DOS64_EXIT()
ret
L(one): shr R8(%rcx), %rax
imul %r8, %rax
mov %rax, (%rdi)
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/bobcat/gmp-mparam.h b/mpn/x86_64/bobcat/gmp-mparam.h
index f1edb1d36..5acb78a62 100644
--- a/mpn/x86_64/bobcat/gmp-mparam.h
+++ b/mpn/x86_64/bobcat/gmp-mparam.h
@@ -58,6 +58,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULMOD_BNM1_THRESHOLD 11
#define SQRMOD_BNM1_THRESHOLD 15
+#define POWM_SEC_TABLE 2,23,322,840
+
#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 376, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
@@ -145,9 +147,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 1308
#define MATRIX22_STRASSEN_THRESHOLD 14
-#define HGCD_THRESHOLD 103
-#define GCD_DC_THRESHOLD 469
-#define GCDEXT_DC_THRESHOLD 290
+#define HGCD_THRESHOLD 105
+#define HGCD_APPR_THRESHOLD 113
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 330
+#define GCDEXT_DC_THRESHOLD 306
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 17
diff --git a/mpn/x86_64/com.asm b/mpn/x86_64/com.asm
index 6ff62eeac..3a232fc20 100644
--- a/mpn/x86_64/com.asm
+++ b/mpn/x86_64/com.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_com.
-dnl Copyright 2004, 2005, 2006 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2006, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -34,11 +34,14 @@ define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_com)
+ DOS64_ENTRY(3)
movq (up), %r8
movl R32(%rdx), R32(%rax)
leaq (up,n,8), up
@@ -76,5 +79,6 @@ L(e10): movq 24(up,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): ret
+L(ret): DOS64_EXIT()
+ ret
EPILOGUE()
diff --git a/mpn/x86_64/copyd.asm b/mpn/x86_64/copyd.asm
index 13210217b..15e929f4e 100644
--- a/mpn/x86_64/copyd.asm
+++ b/mpn/x86_64/copyd.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_copyd -- copy limb vector, decrementing.
-dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2007, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -39,10 +39,14 @@ define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_copyd)
+ DOS64_ENTRY(3)
leaq -8(up,n,8), up
leaq (rp,n,8), rp
subq $4, n
@@ -73,5 +77,6 @@ L(end): shrl R32(%rdx) C edx = lowpart(n)
movq -8(up), %r9
movq %r8, -8(rp)
movq %r9, -16(rp)
-1: ret
+1: DOS64_EXIT()
+ ret
EPILOGUE()
diff --git a/mpn/x86_64/copyi.asm b/mpn/x86_64/copyi.asm
index d5cbdd644..1dd6c3168 100644
--- a/mpn/x86_64/copyi.asm
+++ b/mpn/x86_64/copyi.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_copyi -- copy limb vector, incrementing.
-dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2007, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -39,10 +39,14 @@ define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_copyi)
+ DOS64_ENTRY(3)
leaq -8(rp), rp
subq $4, n
jc L(end)
@@ -72,5 +76,6 @@ L(end): shrl R32(%rdx) C edx = lowpart(n)
movq 8(up), %r9
movq %r8, 8(rp)
movq %r9, 16(rp)
-1: ret
+1: DOS64_EXIT()
+ ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/aorrlsh1_n.asm b/mpn/x86_64/core2/aorrlsh1_n.asm
index 346c21f33..e44e718a6 100644
--- a/mpn/x86_64/core2/aorrlsh1_n.asm
+++ b/mpn/x86_64/core2/aorrlsh1_n.asm
@@ -3,7 +3,7 @@ dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh1_n', `
MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/mpn/x86_64/core2/aorrlsh2_n.asm b/mpn/x86_64/core2/aorrlsh2_n.asm
index 1da0c527f..2d9c89553 100644
--- a/mpn/x86_64/core2/aorrlsh2_n.asm
+++ b/mpn/x86_64/core2/aorrlsh2_n.asm
@@ -3,7 +3,7 @@ dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -36,4 +36,7 @@ ifdef(`OPERATION_rsblsh2_n', `
MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/mpn/x86_64/core2/aorrlsh_n.asm b/mpn/x86_64/core2/aorrlsh_n.asm
index 8d03970ca..a8f5c051a 100644
--- a/mpn/x86_64/core2/aorrlsh_n.asm
+++ b/mpn/x86_64/core2/aorrlsh_n.asm
@@ -20,4 +20,8 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/coreinhm/aorrlsh_n.asm')
diff --git a/mpn/x86_64/core2/aors_n.asm b/mpn/x86_64/core2/aors_n.asm
index 75807c79a..bc109cc22 100644
--- a/mpn/x86_64/core2/aors_n.asm
+++ b/mpn/x86_64/core2/aors_n.asm
@@ -1,6 +1,6 @@
dnl Intel P6-15 mpn_add_n/mpn_sub_n -- mpn add or subtract.
-dnl Copyright 2006, 2007 Free Software Foundation, Inc.
+dnl Copyright 2006, 2007, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -48,16 +48,28 @@ ifdef(`OPERATION_sub_n', `
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-ASM_START()
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+ASM_START()
TEXT
ALIGN(16)
-
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
jmp L(start)
EPILOGUE()
PROLOGUE(func)
+ DOS64_ENTRY(4)
xor %r8, %r8
L(start):
mov (up), %r10
@@ -96,6 +108,7 @@ L(end): ADCSBB %r11, %r10
mov %r10, 8(rp)
mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
ALIGN(16)
diff --git a/mpn/x86_64/core2/aorsmul_1.asm b/mpn/x86_64/core2/aorsmul_1.asm
index bb4f663c4..aeda30159 100644
--- a/mpn/x86_64/core2/aorsmul_1.asm
+++ b/mpn/x86_64/core2/aorsmul_1.asm
@@ -1,6 +1,7 @@
dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".
-dnl Copyright 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2009, 2011 Free Software
+dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -45,10 +46,14 @@ ifdef(`OPERATION_submul_1',`
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
push %rbx
push %rbp
lea (%rdx), %rbx
@@ -127,5 +132,6 @@ L(n1): mov 8(rp), %r10
adc %rdx, %rax
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/gmp-mparam.h b/mpn/x86_64/core2/gmp-mparam.h
index 43adaa078..0752688fd 100644
--- a/mpn/x86_64/core2/gmp-mparam.h
+++ b/mpn/x86_64/core2/gmp-mparam.h
@@ -31,14 +31,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 26
#define MUL_TOOM22_THRESHOLD 23
#define MUL_TOOM33_THRESHOLD 65
-#define MUL_TOOM44_THRESHOLD 178
-#define MUL_TOOM6H_THRESHOLD 222
-#define MUL_TOOM8H_THRESHOLD 0
+#define MUL_TOOM44_THRESHOLD 169
+#define MUL_TOOM6H_THRESHOLD 254
+#define MUL_TOOM8H_THRESHOLD 357
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69
#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
@@ -48,15 +49,17 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 26
#define SQR_TOOM3_THRESHOLD 85
-#define SQR_TOOM4_THRESHOLD 160
-#define SQR_TOOM6_THRESHOLD 218
-#define SQR_TOOM8_THRESHOLD 296
+#define SQR_TOOM4_THRESHOLD 226
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 454
#define MULMID_TOOM42_THRESHOLD 24
#define MULMOD_BNM1_THRESHOLD 15
#define SQRMOD_BNM1_THRESHOLD 15
+#define POWM_SEC_TABLE 2,41,322,840,1100,1556
+
#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 380, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
@@ -156,8 +159,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_THRESHOLD 2752
#define MULLO_BASECASE_THRESHOLD 3
-#define MULLO_DC_THRESHOLD 20
-#define MULLO_MUL_N_THRESHOLD 10950
+#define MULLO_DC_THRESHOLD 18
+#define MULLO_MUL_N_THRESHOLD 9174
#define DC_DIV_QR_THRESHOLD 47
#define DC_DIVAPPR_Q_THRESHOLD 179
@@ -180,11 +183,13 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MATRIX22_STRASSEN_THRESHOLD 18
#define HGCD_THRESHOLD 135
+#define HGCD_APPR_THRESHOLD 169
+#define HGCD_REDUCE_THRESHOLD 2121
#define GCD_DC_THRESHOLD 330
#define GCDEXT_DC_THRESHOLD 361
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 13
#define GET_STR_PRECOMPUTE_THRESHOLD 23
-#define SET_STR_DC_THRESHOLD 746
+#define SET_STR_DC_THRESHOLD 552
#define SET_STR_PRECOMPUTE_THRESHOLD 1893
diff --git a/mpn/x86_64/core2/lshift.asm b/mpn/x86_64/core2/lshift.asm
index 3b17e8315..2e175de76 100644
--- a/mpn/x86_64/core2/lshift.asm
+++ b/mpn/x86_64/core2/lshift.asm
@@ -1,6 +1,6 @@
dnl x86-64 mpn_lshift optimized for "Core 2".
-dnl Copyright 2007, 2009 Free Software Foundation, Inc.
+dnl Copyright 2007, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -35,12 +35,16 @@ C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%cl')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshift)
+ DOS64_ENTRY(4)
lea -8(rp,n,8), rp
lea -8(up,n,8), up
@@ -51,7 +55,7 @@ L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov -8(up), %r11
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r10, %rax
+ shld R8(cnt), %r10, %rax
mov -16(up), %r8
lea 24(rp), rp
sub $4, n
@@ -62,7 +66,7 @@ L(nb00):C n = 1, 5, 9, ...
jae L(nb01)
L(b01): mov (up), %r9
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r9, %rax
+ shld R8(cnt), %r9, %rax
sub $2, n
jb L(le1)
mov -8(up), %r10
@@ -70,8 +74,9 @@ L(b01): mov (up), %r9
lea -8(up), up
lea 16(rp), rp
jmp L(01)
-L(le1): shl R8(%rcx), %r9
+L(le1): shl R8(cnt), %r9
mov %r9, (rp)
+ DOS64_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
@@ -79,17 +84,18 @@ L(nb01):C n = 2, 6, 10, ...
L(b10): mov (up), %r8
mov -8(up), %r9
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r8, %rax
+ shld R8(cnt), %r8, %rax
sub $3, n
jb L(le2)
mov -16(up), %r10
lea -16(up), up
lea 8(rp), rp
jmp L(10)
-L(le2): shld R8(%rcx), %r9, %r8
+L(le2): shld R8(cnt), %r9, %r8
mov %r8, (rp)
- shl R8(%rcx), %r9
+ shl R8(cnt), %r9
mov %r9, -8(rp)
+ DOS64_EXIT()
ret
ALIGN(16) C performance critical!
@@ -97,23 +103,23 @@ L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov -8(up), %r8
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r11, %rax
+ shld R8(cnt), %r11, %rax
mov -16(up), %r9
lea -24(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shld R8(%rcx), %r8, %r11
+L(top): shld R8(cnt), %r8, %r11
mov (up), %r10
mov %r11, (rp)
-L(10): shld R8(%rcx), %r9, %r8
+L(10): shld R8(cnt), %r9, %r8
mov -8(up), %r11
mov %r8, -8(rp)
-L(01): shld R8(%rcx), %r10, %r9
+L(01): shld R8(cnt), %r10, %r9
mov -16(up), %r8
mov %r9, -16(rp)
-L(00): shld R8(%rcx), %r11, %r10
+L(00): shld R8(cnt), %r11, %r10
mov -24(up), %r9
mov %r10, -24(rp)
add $-32, up
@@ -121,11 +127,12 @@ L(00): shld R8(%rcx), %r11, %r10
sub $4, n
jnc L(top)
-L(end): shld R8(%rcx), %r8, %r11
+L(end): shld R8(cnt), %r8, %r11
mov %r11, (rp)
- shld R8(%rcx), %r9, %r8
+ shld R8(cnt), %r9, %r8
mov %r8, -8(rp)
- shl R8(%rcx), %r9
+ shl R8(cnt), %r9
mov %r9, -16(rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/lshiftc.asm b/mpn/x86_64/core2/lshiftc.asm
index a19f72297..31a08f7ae 100644
--- a/mpn/x86_64/core2/lshiftc.asm
+++ b/mpn/x86_64/core2/lshiftc.asm
@@ -1,6 +1,6 @@
dnl x86-64 mpn_lshiftc optimized for "Core 2".
-dnl Copyright 2007, 2009 Free Software Foundation, Inc.
+dnl Copyright 2007, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -35,12 +35,16 @@ C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%cl')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshiftc)
+ DOS64_ENTRY(4)
lea -8(rp,n,8), rp
lea -8(up,n,8), up
@@ -51,7 +55,7 @@ L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov -8(up), %r11
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r10, %rax
+ shld R8(cnt), %r10, %rax
mov -16(up), %r8
lea 24(rp), rp
sub $4, n
@@ -62,7 +66,7 @@ L(nb00):C n = 1, 5, 9, ...
jae L(nb01)
L(b01): mov (up), %r9
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r9, %rax
+ shld R8(cnt), %r9, %rax
sub $2, n
jb L(le1)
mov -8(up), %r10
@@ -70,9 +74,10 @@ L(b01): mov (up), %r9
lea -8(up), up
lea 16(rp), rp
jmp L(01)
-L(le1): shl R8(%rcx), %r9
+L(le1): shl R8(cnt), %r9
not %r9
mov %r9, (rp)
+ DOS64_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
@@ -80,19 +85,20 @@ L(nb01):C n = 2, 6, 10, ...
L(b10): mov (up), %r8
mov -8(up), %r9
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r8, %rax
+ shld R8(cnt), %r8, %rax
sub $3, n
jb L(le2)
mov -16(up), %r10
lea -16(up), up
lea 8(rp), rp
jmp L(10)
-L(le2): shld R8(%rcx), %r9, %r8
+L(le2): shld R8(cnt), %r9, %r8
not %r8
mov %r8, (rp)
- shl R8(%rcx), %r9
+ shl R8(cnt), %r9
not %r9
mov %r9, -8(rp)
+ DOS64_EXIT()
ret
ALIGN(16) C performance critical!
@@ -100,26 +106,26 @@ L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov -8(up), %r8
xor R32(%rax), R32(%rax)
- shld R8(%rcx), %r11, %rax
+ shld R8(cnt), %r11, %rax
mov -16(up), %r9
lea -24(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shld R8(%rcx), %r8, %r11
+L(top): shld R8(cnt), %r8, %r11
mov (up), %r10
not %r11
mov %r11, (rp)
-L(10): shld R8(%rcx), %r9, %r8
+L(10): shld R8(cnt), %r9, %r8
mov -8(up), %r11
not %r8
mov %r8, -8(rp)
-L(01): shld R8(%rcx), %r10, %r9
+L(01): shld R8(cnt), %r10, %r9
mov -16(up), %r8
not %r9
mov %r9, -16(rp)
-L(00): shld R8(%rcx), %r11, %r10
+L(00): shld R8(cnt), %r11, %r10
mov -24(up), %r9
not %r10
mov %r10, -24(rp)
@@ -128,14 +134,15 @@ L(00): shld R8(%rcx), %r11, %r10
sub $4, n
jnc L(top)
-L(end): shld R8(%rcx), %r8, %r11
+L(end): shld R8(cnt), %r8, %r11
not %r11
mov %r11, (rp)
- shld R8(%rcx), %r9, %r8
+ shld R8(cnt), %r9, %r8
not %r8
mov %r8, -8(rp)
- shl R8(%rcx), %r9
+ shl R8(cnt), %r9
not %r9
mov %r9, -16(rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/rsh1aors_n.asm b/mpn/x86_64/core2/rsh1aors_n.asm
index eb52efc08..b350e4a43 100644
--- a/mpn/x86_64/core2/rsh1aors_n.asm
+++ b/mpn/x86_64/core2/rsh1aors_n.asm
@@ -1,6 +1,6 @@
dnl Intel P6/64 mpn_rsh1add_n and mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1
-dnl Copyright 2003, 2005, 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -49,11 +49,24 @@ ifdef(`OPERATION_rsh1sub_n', `
MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
push %rbx
push %rbp
@@ -66,6 +79,7 @@ EPILOGUE()
ALIGN(16)
PROLOGUE(func_n)
+ DOS64_ENTRY(4)
push %rbx
push %rbp
@@ -171,5 +185,6 @@ L(end): shrd $1, %rbx, %rbp
mov %rbp, (rp)
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/rshift.asm b/mpn/x86_64/core2/rshift.asm
index 38a77364f..68306881c 100644
--- a/mpn/x86_64/core2/rshift.asm
+++ b/mpn/x86_64/core2/rshift.asm
@@ -1,6 +1,6 @@
dnl x86-64 mpn_rshift optimized for "Core 2".
-dnl Copyright 2007, 2009 Free Software Foundation, Inc.
+dnl Copyright 2007, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -35,12 +35,16 @@ C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')
-define(`cnt', `%cl')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_rshift)
+ DOS64_ENTRY(4)
mov R32(%rdx), R32(%rax)
and $3, R32(%rax)
jne L(nb00)
@@ -48,7 +52,7 @@ L(b00): C n = 4, 8, 12, ...
mov (up), %r10
mov 8(up), %r11
xor R32(%rax), R32(%rax)
- shrd R8(%rcx), %r10, %rax
+ shrd R8(cnt), %r10, %rax
mov 16(up), %r8
lea 8(up), up
lea -24(rp), rp
@@ -60,7 +64,7 @@ L(nb00):C n = 1, 5, 9, ...
jae L(nb01)
L(b01): mov (up), %r9
xor R32(%rax), R32(%rax)
- shrd R8(%rcx), %r9, %rax
+ shrd R8(cnt), %r9, %rax
sub $2, n
jb L(le1)
mov 8(up), %r10
@@ -68,8 +72,9 @@ L(b01): mov (up), %r9
lea 16(up), up
lea -16(rp), rp
jmp L(01)
-L(le1): shr R8(%rcx), %r9
+L(le1): shr R8(cnt), %r9
mov %r9, (rp)
+ DOS64_EXIT()
ret
L(nb01):C n = 2, 6, 10, ...
@@ -77,17 +82,18 @@ L(nb01):C n = 2, 6, 10, ...
L(b10): mov (up), %r8
mov 8(up), %r9
xor R32(%rax), R32(%rax)
- shrd R8(%rcx), %r8, %rax
+ shrd R8(cnt), %r8, %rax
sub $3, n
jb L(le2)
mov 16(up), %r10
lea 24(up), up
lea -8(rp), rp
jmp L(10)
-L(le2): shrd R8(%rcx), %r9, %r8
+L(le2): shrd R8(cnt), %r9, %r8
mov %r8, (rp)
- shr R8(%rcx), %r9
+ shr R8(cnt), %r9
mov %r9, 8(rp)
+ DOS64_EXIT()
ret
ALIGN(16)
@@ -95,23 +101,23 @@ L(b11): C n = 3, 7, 11, ...
mov (up), %r11
mov 8(up), %r8
xor R32(%rax), R32(%rax)
- shrd R8(%rcx), %r11, %rax
+ shrd R8(cnt), %r11, %rax
mov 16(up), %r9
lea 32(up), up
sub $4, n
jb L(end)
ALIGN(16)
-L(top): shrd R8(%rcx), %r8, %r11
+L(top): shrd R8(cnt), %r8, %r11
mov -8(up), %r10
mov %r11, (rp)
-L(10): shrd R8(%rcx), %r9, %r8
+L(10): shrd R8(cnt), %r9, %r8
mov (up), %r11
mov %r8, 8(rp)
-L(01): shrd R8(%rcx), %r10, %r9
+L(01): shrd R8(cnt), %r10, %r9
mov 8(up), %r8
mov %r9, 16(rp)
-L(00): shrd R8(%rcx), %r11, %r10
+L(00): shrd R8(cnt), %r11, %r10
mov 16(up), %r9
mov %r10, 24(rp)
add $32, up
@@ -119,11 +125,12 @@ L(00): shrd R8(%rcx), %r11, %r10
sub $4, n
jnc L(top)
-L(end): shrd R8(%rcx), %r8, %r11
+L(end): shrd R8(cnt), %r8, %r11
mov %r11, (rp)
- shrd R8(%rcx), %r9, %r8
+ shrd R8(cnt), %r9, %r8
mov %r8, 8(rp)
- shr R8(%rcx), %r9
+ shr R8(cnt), %r9
mov %r9, 16(rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/core2/sublsh1_n.asm b/mpn/x86_64/core2/sublsh1_n.asm
index 7522b429f..50411d7d0 100644
--- a/mpn/x86_64/core2/sublsh1_n.asm
+++ b/mpn/x86_64/core2/sublsh1_n.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -30,4 +30,7 @@ define(func, mpn_sublsh1_n)
MULFUNC_PROLOGUE(mpn_sublsh1_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/mpn/x86_64/core2/sublsh2_n.asm b/mpn/x86_64/core2/sublsh2_n.asm
index 036d2c859..affc87177 100644
--- a/mpn/x86_64/core2/sublsh2_n.asm
+++ b/mpn/x86_64/core2/sublsh2_n.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -30,4 +30,7 @@ define(func, mpn_sublsh2_n)
MULFUNC_PROLOGUE(mpn_sublsh2_n)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/mpn/x86_64/core2/sublshC_n.asm b/mpn/x86_64/core2/sublshC_n.asm
index 2f89c35e3..7c4545f5a 100644
--- a/mpn/x86_64/core2/sublshC_n.asm
+++ b/mpn/x86_64/core2/sublshC_n.asm
@@ -3,7 +3,7 @@ dnl Core iN.
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -40,6 +40,7 @@ ASM_START()
TEXT
ALIGN(8)
PROLOGUE(func)
+ DOS64_ENTRY(4)
push %rbx
push %r12
@@ -141,5 +142,6 @@ L(end): shr $RSH, %r11
pop %rbx
sub R32(%r11), R32(%rax)
neg R32(%rax)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/coreinhm/aorrlsh_n.asm b/mpn/x86_64/coreinhm/aorrlsh_n.asm
index a4afae69d..e22cc065d 100644
--- a/mpn/x86_64/coreinhm/aorrlsh_n.asm
+++ b/mpn/x86_64/coreinhm/aorrlsh_n.asm
@@ -62,10 +62,23 @@ C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with
C refmpn_rsblsh_nc
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(func_n)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
push %rbx
xor R32(%rbx), R32(%rbx) C clear CF save register
L(ent): push %rbp
@@ -170,9 +183,13 @@ L(wd1): shrd %cl, %r8, %r11
IFRSB( neg %rax)
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+IFDOS(` mov 64(%rsp), %r9 ') C cy
push %rbx
neg cy
sbb R32(%rbx), R32(%rbx) C initialise CF save register
diff --git a/mpn/x86_64/coreinhm/gmp-mparam.h b/mpn/x86_64/coreinhm/gmp-mparam.h
index eec17787d..0a0ada3c5 100644
--- a/mpn/x86_64/coreinhm/gmp-mparam.h
+++ b/mpn/x86_64/coreinhm/gmp-mparam.h
@@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 15
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 17
@@ -52,56 +53,92 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 318
#define SQR_TOOM8_THRESHOLD 502
+#define MULMID_TOOM42_THRESHOLD 22
+
#define MULMOD_BNM1_THRESHOLD 13
#define SQRMOD_BNM1_THRESHOLD 13
+#define POWM_SEC_TABLE 3,42,83,643,2080
+
#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
- { 23, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
{ 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
{ 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
{ 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
{ 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
{ 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
{ 31,10}, { 79,11}, { 47,10}, { 95,12}, \
{ 31,11}, { 63,10}, { 135,11}, { 79,10}, \
- { 159, 9}, { 319, 8}, { 639,10}, { 167,11}, \
- { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
- { 575,11}, { 159,10}, { 319,12}, { 95,11}, \
- { 191,10}, { 383,11}, { 207,13}, { 8192,14}, \
- { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
- { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
- {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 74
+ { 159,11}, { 95,10}, { 191, 9}, { 383,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,13}, \
+ { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 639,12}, { 351,11}, { 703,10}, { 1407,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 607,13}, { 319,12}, { 703,11}, \
+ { 1407,13}, { 383,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 959,11}, { 1919,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 137
#define MUL_FFT_THRESHOLD 3712
-#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD 304 /* k = 5 */
#define SQR_FFT_TABLE3 \
- { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
{ 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
{ 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
{ 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
{ 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
{ 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 43,10}, { 23, 9}, { 47,11}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
{ 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
- { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
- { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
- { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
- { 511,10}, { 135,11}, { 79,10}, { 159, 9}, \
- { 319,11}, { 95,10}, { 191, 9}, { 383, 8}, \
- { 767,12}, { 63,10}, { 255,11}, { 143, 9}, \
- { 575, 8}, { 1151,11}, { 159,10}, { 319, 9}, \
- { 639,11}, { 175,12}, { 95,11}, { 191,10}, \
- { 383,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 76
-#define SQR_FFT_THRESHOLD 3200
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,12}, { 31,11}, { 63,10}, { 127, 9}, \
+ { 255,11}, { 79,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319,11}, { 175,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 511,10}, \
+ { 1023,11}, { 543,12}, { 287,11}, { 575,10}, \
+ { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,11}, { 895,12}, \
+ { 479,11}, { 959,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \
+ { 447,12}, { 959,11}, { 1919,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 137
+#define SQR_FFT_THRESHOLD 2752
#define MULLO_BASECASE_THRESHOLD 4
#define MULLO_DC_THRESHOLD 21
@@ -112,8 +149,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DC_BDIV_QR_THRESHOLD 32
#define DC_BDIV_Q_THRESHOLD 70
-#define INV_MULMOD_BNM1_THRESHOLD 46
-#define INV_NEWTON_THRESHOLD 195
+#define INV_MULMOD_BNM1_THRESHOLD 34
+#define INV_NEWTON_THRESHOLD 177
#define INV_APPR_THRESHOLD 147
#define BINV_NEWTON_THRESHOLD 252
@@ -126,13 +163,15 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_QR_THRESHOLD 1120
#define MU_BDIV_Q_THRESHOLD 1187
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 117
-#define GCD_DC_THRESHOLD 330
-#define GCDEXT_DC_THRESHOLD 382
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 126
+#define HGCD_APPR_THRESHOLD 171
+#define HGCD_REDUCE_THRESHOLD 2205
+#define GCD_DC_THRESHOLD 345
+#define GCDEXT_DC_THRESHOLD 386
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_DC_THRESHOLD 15
#define GET_STR_PRECOMPUTE_THRESHOLD 20
-#define SET_STR_DC_THRESHOLD 552
-#define SET_STR_PRECOMPUTE_THRESHOLD 1655
+#define SET_STR_DC_THRESHOLD 232
+#define SET_STR_PRECOMPUTE_THRESHOLD 1585
diff --git a/mpn/x86_64/coreisbr/aors_n.asm b/mpn/x86_64/coreisbr/aors_n.asm
index 66a5e3b60..4d8d1cccf 100644
--- a/mpn/x86_64/coreisbr/aors_n.asm
+++ b/mpn/x86_64/coreisbr/aors_n.asm
@@ -49,10 +49,22 @@ ifdef(`OPERATION_sub_n', `
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func)
+ DOS64_ENTRY(4)
xor %r8, %r8
L(ent): mov R32(n), R32(%rax)
shr $2, n
@@ -144,5 +156,8 @@ L(e1): ADCSBB 16(vp), %r10
+ DOS64_EXIT() C pop the rsi/rdi pushed by DOS64_ENTRY (reached from both func and func_nc)
ret
EPILOGUE()
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ') C 5th argument comes from the stack under DOS64; L(ent) expects it in r8
jmp L(ent)
EPILOGUE()
diff --git a/mpn/x86_64/coreisbr/gmp-mparam.h b/mpn/x86_64/coreisbr/gmp-mparam.h
index e4727116b..c30c64ec8 100644
--- a/mpn/x86_64/coreisbr/gmp-mparam.h
+++ b/mpn/x86_64/coreisbr/gmp-mparam.h
@@ -29,8 +29,9 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 30
@@ -52,58 +53,123 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_TOOM6_THRESHOLD 0
#define SQR_TOOM8_THRESHOLD 458
-#define MULMOD_BNM1_THRESHOLD 11
-#define SQRMOD_BNM1_THRESHOLD 16
+#define MULMID_TOOM42_THRESHOLD 24
-#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 14
+
+#define POWM_SEC_TABLE 4,35,130,713,2080
+
+#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
#define MUL_FFT_TABLE3 \
- { { 376, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
- { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
- { 21, 7}, { 11, 6}, { 23, 7}, { 13, 6}, \
- { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { { 380, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
{ 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
{ 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
{ 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
{ 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
{ 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
{ 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
- { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
{ 95,10}, { 55,11}, { 31,10}, { 79,11}, \
{ 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 135,11}, { 79,10}, { 167,11}, { 95,10}, \
- { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \
- { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \
- { 287, 9}, { 575,11}, { 159,10}, { 319,12}, \
- { 95,11}, { 191,10}, { 383,11}, { 207,13}, \
- { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 135,11}, { 79,10}, { 159, 9}, { 319,10}, \
+ { 167,11}, { 95,10}, { 191, 9}, { 383,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 319,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,10}, { 703, 9}, { 1407,11}, \
+ { 367,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,12}, { 223,11}, { 447,10}, \
+ { 895,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,10}, { 1407,11}, { 735,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,10}, \
+ { 1663,12}, { 447,11}, { 895,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
+ { 1087,12}, { 575,11}, { 1151,12}, { 607,11}, \
+ { 1215,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 831,11}, { 1663,13}, { 447,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,12}, \
+ { 1919,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
+ { 1407,13}, { 2815,15}, { 767,14}, { 1663,13}, \
+ { 3455,14}, { 1919,13}, { 3839,16}, { 511,15}, \
+ { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,13}, { 5887,15}, { 32768,16}, { 65536,17}, \
{ 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
{2097152,22}, {4194304,23}, {8388608,24} }
-#define MUL_FFT_TABLE3_SIZE 83
-#define MUL_FFT_THRESHOLD 3712
+#define MUL_FFT_TABLE3_SIZE 203
+#define MUL_FFT_THRESHOLD 4736
-#define SQR_FFT_MODF_THRESHOLD 316 /* k = 5 */
+#define SQR_FFT_MODF_THRESHOLD 304 /* k = 5 */
#define SQR_FFT_TABLE3 \
- { { 316, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { { 304, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
{ 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
- { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
{ 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
- { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
{ 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
- { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
- { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \
{ 79,10}, { 47,11}, { 31,10}, { 79,11}, \
- { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
- { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \
- { 319,11}, { 95,10}, { 191, 9}, { 383,12}, \
- { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
- { 271, 9}, { 543,11}, { 143,10}, { 287, 9}, \
- { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \
- { 639,12}, { 95,11}, { 191,10}, { 383,11}, \
- { 207,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
- { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
- {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
-#define SQR_FFT_TABLE3_SIZE 76
-#define SQR_FFT_THRESHOLD 3264
+ { 47,12}, { 31,11}, { 63,10}, { 127, 9}, \
+ { 255, 8}, { 511,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639,12}, { 95,11}, { 191,10}, { 383, 9}, \
+ { 767,11}, { 207,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,10}, { 959,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 639,10}, { 1279,12}, { 351,11}, { 703,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 447,11}, { 895,12}, { 479,11}, \
+ { 959,10}, { 1919,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 831,13}, \
+ { 447,12}, { 959,11}, { 1919,14}, { 255,13}, \
+ { 511,12}, { 1087,13}, { 575,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,12}, { 1919,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
+ { 1407,13}, { 2815,15}, { 767,14}, { 1663,13}, \
+ { 3455,14}, { 1919,16}, { 511,15}, { 1023,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 198
+#define SQR_FFT_THRESHOLD 2752
#define MULLO_BASECASE_THRESHOLD 5
#define MULLO_DC_THRESHOLD 33
@@ -114,27 +180,29 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define DC_BDIV_QR_THRESHOLD 31
#define DC_BDIV_Q_THRESHOLD 71
-#define INV_MULMOD_BNM1_THRESHOLD 38
-#define INV_NEWTON_THRESHOLD 127
-#define INV_APPR_THRESHOLD 123
+#define INV_MULMOD_BNM1_THRESHOLD 50
+#define INV_NEWTON_THRESHOLD 123
+#define INV_APPR_THRESHOLD 122
-#define BINV_NEWTON_THRESHOLD 181
-#define REDC_1_TO_REDC_2_THRESHOLD 17
-#define REDC_2_TO_REDC_N_THRESHOLD 51
+#define BINV_NEWTON_THRESHOLD 197
+#define REDC_1_TO_REDC_2_THRESHOLD 20
+#define REDC_2_TO_REDC_N_THRESHOLD 54
#define MU_DIV_QR_THRESHOLD 1334
#define MU_DIVAPPR_Q_THRESHOLD 1387
-#define MUPI_DIV_QR_THRESHOLD 57
+#define MUPI_DIV_QR_THRESHOLD 46
#define MU_BDIV_QR_THRESHOLD 1142
#define MU_BDIV_Q_THRESHOLD 1308
#define MATRIX22_STRASSEN_THRESHOLD 15
-#define HGCD_THRESHOLD 90
-#define GCD_DC_THRESHOLD 400
-#define GCDEXT_DC_THRESHOLD 372
+#define HGCD_THRESHOLD 91
+#define HGCD_APPR_THRESHOLD 105
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 358
+#define GCDEXT_DC_THRESHOLD 351
#define JACOBI_BASE_METHOD 4
-#define GET_STR_DC_THRESHOLD 12
-#define GET_STR_PRECOMPUTE_THRESHOLD 21
-#define SET_STR_DC_THRESHOLD 802
-#define SET_STR_PRECOMPUTE_THRESHOLD 1712
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define SET_STR_DC_THRESHOLD 781
+#define SET_STR_PRECOMPUTE_THRESHOLD 1940
diff --git a/mpn/x86_64/div_qr_2n_pi1.asm b/mpn/x86_64/div_qr_2n_pi1.asm
index 9f23012da..c28d0a02c 100644
--- a/mpn/x86_64/div_qr_2n_pi1.asm
+++ b/mpn/x86_64/div_qr_2n_pi1.asm
@@ -44,7 +44,7 @@ C TODO
C * Store qh in the same stack slot as di_param, instead of pushing
C it. (we could put it in register %rbp, but then we would need to
C save and restore that instead, which doesn't seem like a win).
-
+
ASM_START()
TEXT
ALIGN(16)
@@ -56,7 +56,7 @@ PROLOGUE(mpn_div_qr_2n_pi1)
push %r13
push %r12
push %rbx
-
+
mov -16(up, un, 8), u1
mov -8(up, un, 8), u2
@@ -135,5 +135,5 @@ L(fix): C Unlikely update. u2 >= d1
inc t1
sub d0, u1
sbb d1, u2
- jmp L(bck)
+ jmp L(bck)
EPILOGUE()
diff --git a/mpn/x86_64/div_qr_2u_pi1.asm b/mpn/x86_64/div_qr_2u_pi1.asm
index cfc7712d5..bdb64c148 100644
--- a/mpn/x86_64/div_qr_2u_pi1.asm
+++ b/mpn/x86_64/div_qr_2u_pi1.asm
@@ -66,7 +66,7 @@ deflit(`FRAME', 56)
movl shift_param, R32(%rcx)
C FIXME: Different code for SHLD_SLOW
-
+
xor R32(u2), R32(u2)
mov 8(up, un, 8), u1
shld %cl, u1, u2
@@ -173,7 +173,7 @@ L(fix): C Unlikely update. u2 >= d1
inc t1
sub d0, u1
sbb d1, u2
- jmp L(bck)
+ jmp L(bck)
C Duplicated, just jumping back to a different address.
L(fix_qh): C Unlikely update. u2 >= d1
@@ -185,5 +185,5 @@ L(fix_qh): C Unlikely update. u2 >= d1
inc t1
sub d0, u1
sbb d1, u2
- jmp L(bck_qh)
+ jmp L(bck_qh)
EPILOGUE()
diff --git a/mpn/x86_64/dos64.m4 b/mpn/x86_64/dos64.m4
new file mode 100644
index 000000000..ef60834ec
--- /dev/null
+++ b/mpn/x86_64/dos64.m4
@@ -0,0 +1,39 @@
+divert(-1)
+dnl Copyright 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+dnl HOST_DOS64 is a marker macro: the asm files test it with ifdef to choose between their IFDOS and IFELF variants.
+define(`HOST_DOS64')
+dnl JUMPTABSECT expands to a .section directive for .rdata; presumably the section for jump tables -- NOTE(review): confirm at the use sites.
+define(`JUMPTABSECT', `.section .rdata,"dr"')
+dnl DOS64_ENTRY(n), n = 1..4: push rdi and rsi, then copy the first n arguments from rcx, rdx, r8, r9 into rdi, rsi, rdx, rcx (rcx is read first and overwritten last).
+define(`DOS64_ENTRY',
+ `push %rdi
+ push %rsi
+ mov %rcx, %rdi
+ifelse(eval($1>=2),1,`dnl
+ mov %rdx, %rsi
+ifelse(eval($1>=3),1,`dnl
+ mov %r8, %rdx
+ifelse(eval($1>=4),1,`dnl
+ mov %r9, %rcx
+')')')')
+dnl DOS64_EXIT: pop rsi then rdi, undoing the two pushes made by DOS64_ENTRY; the stack must be back at its post-entry depth when this is used before ret.
+define(`DOS64_EXIT',
+ `pop %rsi
+ pop %rdi')
+
+divert`'dnl
diff --git a/mpn/x86_64/gmp-mparam.h b/mpn/x86_64/gmp-mparam.h
index 99499da2b..aca6853f0 100644
--- a/mpn/x86_64/gmp-mparam.h
+++ b/mpn/x86_64/gmp-mparam.h
@@ -30,6 +30,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 15
@@ -56,6 +57,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULMOD_BNM1_THRESHOLD 17
#define SQRMOD_BNM1_THRESHOLD 17
+#define POWM_SEC_TABLE 2,67,322,991
+
#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */
#define MUL_FFT_TABLE3 \
{ { 570, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
@@ -187,10 +190,12 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_QR_THRESHOLD 1589
#define MU_BDIV_Q_THRESHOLD 1718
-#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 139
-#define GCD_DC_THRESHOLD 606
-#define GCDEXT_DC_THRESHOLD 474
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 125
+#define HGCD_APPR_THRESHOLD 173
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 555
+#define GCDEXT_DC_THRESHOLD 478
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 12
diff --git a/mpn/x86_64/invert_limb.asm b/mpn/x86_64/invert_limb.asm
index 8c6aa68b6..06cf1414a 100644
--- a/mpn/x86_64/invert_limb.asm
+++ b/mpn/x86_64/invert_limb.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_invert_limb -- Invert a normalized limb.
dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
-dnl Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc.
+dnl Copyright 2004, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -33,11 +33,14 @@ C VIA nano 79 157
C rax rcx rdx rdi rsi r8
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_invert_limb) C Kn C2 Ci
+ DOS64_ENTRY(1)
mov %rdi, %rax C 0 0 0
shr $55, %rax C 1 1 1
ifdef(`PIC',`
@@ -94,6 +97,7 @@ ifdef(`DARWIN',`
adc %rdi, %rdx
sub %rdx, %rax
+ DOS64_EXIT()
ret
EPILOGUE()
ASM_END()
diff --git a/mpn/x86_64/invert_limb_table.asm b/mpn/x86_64/invert_limb_table.asm
index 98a331372..86d75b8ce 100644
--- a/mpn/x86_64/invert_limb_table.asm
+++ b/mpn/x86_64/invert_limb_table.asm
@@ -21,6 +21,9 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
C Table entry X contains floor (0x7fd00 / (0x100 + X))
diff --git a/mpn/x86_64/logops_n.asm b/mpn/x86_64/logops_n.asm
index 1df564a8f..02b9da549 100644
--- a/mpn/x86_64/logops_n.asm
+++ b/mpn/x86_64/logops_n.asm
@@ -1,6 +1,6 @@
dnl AMD64 logops.
-dnl Copyright 2004, 2005, 2006 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2006, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -72,6 +72,8 @@ define(`up',`%rsi')
define(`vp',`%rdx')
define(`n',`%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
@@ -79,6 +81,7 @@ ifdef(`VARIANT_1',`
TEXT
ALIGN(32)
PROLOGUE(func)
+ DOS64_ENTRY(4)
movq (vp), %r8
movl R32(%rcx), R32(%rax)
leaq (vp,n,8), vp
@@ -117,7 +120,8 @@ L(e10): movq 24(vp,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): ret
+L(ret): DOS64_EXIT()
+ ret
EPILOGUE()
')
@@ -125,6 +129,7 @@ ifdef(`VARIANT_2',`
TEXT
ALIGN(32)
PROLOGUE(func)
+ DOS64_ENTRY(4)
movq (vp), %r8
notq %r8
movl R32(%rcx), R32(%rax)
@@ -168,7 +173,8 @@ L(e10): movq 24(vp,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): ret
+L(ret): DOS64_EXIT()
+ ret
EPILOGUE()
')
@@ -176,6 +182,7 @@ ifdef(`VARIANT_3',`
TEXT
ALIGN(32)
PROLOGUE(func)
+ DOS64_ENTRY(4)
movq (vp), %r8
movl R32(%rcx), R32(%rax)
leaq (vp,n,8), vp
@@ -220,6 +227,7 @@ L(e10): movq 24(vp,n,8), %r9
movq %r9, 24(rp,n,8)
addq $4, n
jnc L(oop)
-L(ret): ret
+L(ret): DOS64_EXIT()
+ ret
EPILOGUE()
')
diff --git a/mpn/x86_64/lshift.asm b/mpn/x86_64/lshift.asm
index 2f3d5c94d..5852ba9f9 100644
--- a/mpn/x86_64/lshift.asm
+++ b/mpn/x86_64/lshift.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_lshift -- mpn left shift.
-dnl Copyright 2003, 2005, 2007, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2007, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -36,10 +36,14 @@ define(`up', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_lshift)
+ DOS64_ENTRY(4)
cmp $1, R8(%rcx)
jne L(gen)
@@ -83,6 +87,7 @@ L(t1): mov (up), %r8
dec R32(%rax)
jne L(n00)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(e1): test R32(%rax), R32(%rax) C clear cy
L(n00): mov (up), %r8
@@ -91,6 +96,7 @@ L(n00): mov (up), %r8
adc %r8, %r8
mov %r8, (rp)
L(ret): adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(n01): dec R32(%rax)
mov 8(up), %r9
@@ -100,6 +106,7 @@ L(n01): dec R32(%rax)
mov %r8, (rp)
mov %r9, 8(rp)
adc R32(%rax), R32(%rax)
+ DOS64_EXIT()
ret
L(n10): mov 16(up), %r10
adc %r8, %r8
@@ -109,6 +116,7 @@ L(n10): mov 16(up), %r10
mov %r9, 8(rp)
mov %r10, 16(rp)
adc $-1, R32(%rax)
+ DOS64_EXIT()
ret
L(gen): neg R32(%rcx) C put rsh count in cl
@@ -222,5 +230,6 @@ L(end):
L(ast): mov (up), %r10
shl R8(%rcx), %r10
mov %r10, (rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/lshiftc.asm b/mpn/x86_64/lshiftc.asm
index 93bb614d3..b4124b037 100644
--- a/mpn/x86_64/lshiftc.asm
+++ b/mpn/x86_64/lshiftc.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_lshiftc -- mpn left shift with complement.
-dnl Copyright 2003, 2005, 2006, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2006, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -36,10 +36,14 @@ define(`up', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_lshiftc)
+ DOS64_ENTRY(4)
neg R32(%rcx) C put rsh count in cl
mov -8(up,n,8), %rax
shr R8(%rcx), %rax C function return value
@@ -162,5 +166,6 @@ L(ast): mov (up), %r10
shl R8(%rcx), %r10
not %r10
mov %r10, (rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm
index 3a42863ad..6e5816b1c 100644
--- a/mpn/x86_64/lshsub_n.asm
+++ b/mpn/x86_64/lshsub_n.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_lshsub_n. R = 2^k(U - V).
-dnl Copyright 2006 Free Software Foundation, Inc.
+dnl Copyright 2006, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -44,10 +44,23 @@ define(`vp', `%rdx')
define(`n', `%rcx')
define(`cnt', `%r8')
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_lshsub_n)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
push %r12
push %r13
@@ -151,5 +164,6 @@ L(end):
pop %r13
pop %r12
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mod_1_1.asm b/mpn/x86_64/mod_1_1.asm
index 6b233e074..8afa96e05 100644
--- a/mpn/x86_64/mod_1_1.asm
+++ b/mpn/x86_64/mod_1_1.asm
@@ -51,7 +51,7 @@ C Note: This implementation needs B1modb only when cnt > 0
C The iteration is almost as follows,
C
C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u
-C
+C
C where r2 is a single bit represented as a mask. But to make sure that the
C result fits in two limbs and a bit, carry from the addition
C
@@ -67,10 +67,14 @@ C the source of the cmov in the loop.
C
C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
+ DOS64_ENTRY(4)
push %rbp
push %rbx
mov %rdx, b
@@ -163,6 +167,7 @@ L(ok): shr R8(%rcx), %rax
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
L(fix): sub b, %rax
jmp L(ok)
@@ -170,6 +175,7 @@ EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
+ DOS64_ENTRY(2)
push %rbp
bsr %rsi, %rcx
push %rbx
@@ -206,11 +212,12 @@ ifdef(`SHLD_SLOW',`
')
imul %rdx, %r8
shr R8(%rcx), %r8
- mov %r8, 16(%rbx) C store B1modb
+ mov %r8, 16(%rbx) C store B1modb
L(z):
pop %r12
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
ASM_END()
diff --git a/mpn/x86_64/mod_1_2.asm b/mpn/x86_64/mod_1_2.asm
index a0ecb6855..b09f24bc0 100644
--- a/mpn/x86_64/mod_1_2.asm
+++ b/mpn/x86_64/mod_1_2.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_mod_1s_2p
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -31,10 +31,14 @@ C Intel SBR 4.5
C Intel atom 28
C VIA nano 8
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mod_1s_2p)
+ DOS64_ENTRY(4)
push %r14
test $1, R8(%rsi)
mov %rdx, %r14
@@ -145,6 +149,7 @@ L(1): xor R32(%rcx), R32(%rcx)
pop %r12
pop %r13
pop %r14
+ DOS64_EXIT()
ret
L(one):
mov (%rdi), %r8
@@ -154,6 +159,7 @@ L(one):
EPILOGUE()
PROLOGUE(mpn_mod_1s_2p_cps)
+ DOS64_ENTRY(2)
push %rbp
bsr %rsi, %rcx
push %rbx
@@ -214,5 +220,6 @@ ifdef(`SHLD_SLOW',`
pop %r12
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mod_1_4.asm b/mpn/x86_64/mod_1_4.asm
index d99080d7f..629520877 100644
--- a/mpn/x86_64/mod_1_4.asm
+++ b/mpn/x86_64/mod_1_4.asm
@@ -2,7 +2,7 @@ dnl AMD64 mpn_mod_1s_4p
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+dnl Copyright 2009, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -30,17 +30,22 @@ C Intel corei 4
C Intel atom 23
C VIA nano 4.75
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
+ DOS64_ENTRY(4)
+ push %r15
push %r14
push %r13
push %r12
push %rbp
push %rbx
- mov %rdx, -16(%rsp)
+ mov %rdx, %r15
mov %rcx, %r14
mov 16(%rcx), %r11 C B1modb
mov 24(%rcx), %rbx C B2modb
@@ -135,7 +140,7 @@ L(end): mov 8(%r14), R32(%rsi)
or %rdx, %rdi
mov %rdi, %rax
mulq (%r14)
- mov -16(%rsp), %rbx
+ mov %r15, %rbx
mov %rax, %r9
sal R8(%rcx), %r8
inc %rdi
@@ -155,11 +160,14 @@ L(end): mov 8(%r14), R32(%rsi)
pop %r12
pop %r13
pop %r14
+ pop %r15
+ DOS64_EXIT()
ret
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
+ DOS64_ENTRY(2)
push %rbp
bsr %rsi, %rcx
push %rbx
@@ -244,5 +252,6 @@ ifdef(`SHLD_SLOW',`
pop %r12
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mod_34lsub1.asm b/mpn/x86_64/mod_34lsub1.asm
index 08cd7d939..ee4d0d347 100644
--- a/mpn/x86_64/mod_34lsub1.asm
+++ b/mpn/x86_64/mod_34lsub1.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
-dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010 Free Software
-dnl Foundation, Inc.
+dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010, 2011 Free
+dnl Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -39,10 +39,14 @@ C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C TODO
C * Review feed-in and wind-down code.
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
+ DOS64_ENTRY(2)
mov $0x0000FFFFFFFFFFFF, %r11
@@ -66,7 +70,8 @@ PROLOGUE(mpn_mod_34lsub1)
shl $16, %rdx C src[1] low
add %rdx, %rax
-L(one): ret
+L(one): DOS64_EXIT()
+ ret
C Don't change this, the wind-down code is not able to handle greater values
@@ -176,5 +181,6 @@ L(0): add %r9, %rax
add %rdx, %rax C apply 2mod3 high
add %rdi, %rax C apply 2mod3 low
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mul_1.asm b/mpn/x86_64/mul_1.asm
index 5f8dc4c9c..3b87bbf01 100644
--- a/mpn/x86_64/mul_1.asm
+++ b/mpn/x86_64/mul_1.asm
@@ -28,38 +28,65 @@ C Intel corei 3.8
C Intel atom 19.8
C VIA nano ?
-C The inner loop of this code is the result of running a code generation and
+C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
-C TODO:
-C * The inner loop is great, but the prologue and epilogue code was
-C quickly written. Tune it!
+C TODO
+C * The loop is great, but the prologue and epilogue code was quickly written.
+C Tune it!
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`n_param',`%rdx')
-define(`vl', `%rcx')
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vl', `%rcx') C r9
-define(`n', `%r11')
+define(`n', `%r11')
+
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`vl', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_1c)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
push %rbx
- mov %r8, %r10
+IFELF(` mov %r8, %r10')
+IFDOS(` mov 64(%rsp), %r10') C 40 + 3*8 (3 push insns)
jmp L(common)
EPILOGUE()
PROLOGUE(mpn_mul_1)
+
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
push %rbx
xor %r10, %r10
L(common):
mov (up), %rax C read first u limb early
- mov n_param, %rbx C move away n from rdx, mul uses it
+IFELF(` mov n_param, %rbx ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %rbx ')
mul vl
- mov %rbx, %r11
+IFELF(` mov %rbx, n ')
add %r10, %rax
adc $0, %rdx
@@ -145,5 +172,7 @@ L(L2): mul vl
L(ret): mov %rdx, %rax
pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
ret
EPILOGUE()
diff --git a/mpn/x86_64/mul_2.asm b/mpn/x86_64/mul_2.asm
index 206a4ea2c..35deefa8b 100644
--- a/mpn/x86_64/mul_2.asm
+++ b/mpn/x86_64/mul_2.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl store the result in a third limb vector.
-dnl Copyright 2008 Free Software Foundation, Inc.
+dnl Copyright 2008, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -53,10 +53,14 @@ define(`w2', `%rbp')
define(`w3', `%r10')
define(`n', `%r11')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_2)
+ DOS64_ENTRY(4)
push %rbx
push %rbp
@@ -172,5 +176,6 @@ L(m22): mul v1
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mul_basecase.asm b/mpn/x86_64/mul_basecase.asm
index fdba9a6e3..5fede9234 100644
--- a/mpn/x86_64/mul_basecase.asm
+++ b/mpn/x86_64/mul_basecase.asm
@@ -59,10 +59,23 @@ define(`n', `%r11')
define(`outer_addr', `%r14')
define(`un', `%r13')
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mul_basecase)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
push %rbx
push %rbp
push %r12
@@ -448,6 +461,7 @@ L(ret): pop %r15
pop %r12
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/mulmid_basecase.asm b/mpn/x86_64/mulmid_basecase.asm
index 375e7f70e..d2d56d4a4 100644
--- a/mpn/x86_64/mulmid_basecase.asm
+++ b/mpn/x86_64/mulmid_basecase.asm
@@ -50,11 +50,23 @@ define(`vp', `%r15')
define(`vp_inner', `%r10')
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mulmid_basecase)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
push %rbx
push %rbp
push %r12
@@ -539,6 +551,6 @@ L(ret): pop %r15
pop %r12
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
-
EPILOGUE()
diff --git a/mpn/x86_64/nano/gmp-mparam.h b/mpn/x86_64/nano/gmp-mparam.h
index a1c556937..7ee41927b 100644
--- a/mpn/x86_64/nano/gmp-mparam.h
+++ b/mpn/x86_64/nano/gmp-mparam.h
@@ -34,6 +34,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0
#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 22
@@ -50,13 +51,17 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 34
-#define SQR_TOOM3_THRESHOLD 74
-#define SQR_TOOM4_THRESHOLD 620
-#define SQR_TOOM6_THRESHOLD 960
-#define SQR_TOOM8_THRESHOLD 1065
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 592
+#define SQR_TOOM6_THRESHOLD 978
+#define SQR_TOOM8_THRESHOLD 1193
-#define MULMOD_BNM1_THRESHOLD 15
-#define SQRMOD_BNM1_THRESHOLD 17
+#define MULMID_TOOM42_THRESHOLD 28
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 20
+
+#define POWM_SEC_TABLE 2,29,387,1421
#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -176,7 +181,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define SQR_FFT_TABLE3_SIZE 215
#define SQR_FFT_THRESHOLD 3264
-#define MULLO_BASECASE_THRESHOLD 17
+#define MULLO_BASECASE_THRESHOLD 8
#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */
#define MULLO_MUL_N_THRESHOLD 6633
@@ -190,7 +195,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define INV_APPR_THRESHOLD 153
#define BINV_NEWTON_THRESHOLD 182
-#define REDC_1_TO_REDC_2_THRESHOLD 14
+#define REDC_1_TO_REDC_2_THRESHOLD 20
#define REDC_2_TO_REDC_N_THRESHOLD 75
#define MU_DIV_QR_THRESHOLD 1589
@@ -200,12 +205,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 1528
#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 84
-#define GCD_DC_THRESHOLD 465
-#define GCDEXT_DC_THRESHOLD 456
+#define HGCD_THRESHOLD 102
+#define HGCD_APPR_THRESHOLD 113
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 706
+#define GCDEXT_DC_THRESHOLD 465
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 12
#define GET_STR_PRECOMPUTE_THRESHOLD 24
-#define SET_STR_DC_THRESHOLD 537
-#define SET_STR_PRECOMPUTE_THRESHOLD 1639
+#define SET_STR_DC_THRESHOLD 381
+#define SET_STR_PRECOMPUTE_THRESHOLD 1794
diff --git a/mpn/x86_64/pentium4/gmp-mparam.h b/mpn/x86_64/pentium4/gmp-mparam.h
index 8983304c2..4d49fc2cf 100644
--- a/mpn/x86_64/pentium4/gmp-mparam.h
+++ b/mpn/x86_64/pentium4/gmp-mparam.h
@@ -33,34 +33,39 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MOD_1_NORM_THRESHOLD 0 /* always */
#define MOD_1_UNNORM_THRESHOLD 0 /* always */
#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
-#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
-#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32
-#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 38
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
#define BMOD_1_TO_MOD_1_THRESHOLD 20
#define MUL_TOOM22_THRESHOLD 12
-#define MUL_TOOM33_THRESHOLD 66
+#define MUL_TOOM33_THRESHOLD 74
#define MUL_TOOM44_THRESHOLD 118
#define MUL_TOOM6H_THRESHOLD 157
-#define MUL_TOOM8H_THRESHOLD 242
+#define MUL_TOOM8H_THRESHOLD 430
#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
-#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
-#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145
#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
#define SQR_TOOM2_THRESHOLD 20
-#define SQR_TOOM3_THRESHOLD 77
-#define SQR_TOOM4_THRESHOLD 214
+#define SQR_TOOM3_THRESHOLD 69
+#define SQR_TOOM4_THRESHOLD 202
#define SQR_TOOM6_THRESHOLD 254
-#define SQR_TOOM8_THRESHOLD 454
+#define SQR_TOOM8_THRESHOLD 418
+
+#define MULMID_TOOM42_THRESHOLD 19
#define MULMOD_BNM1_THRESHOLD 10
-#define SQRMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 9
+
+#define POWM_SEC_TABLE 3,130,140,724,2316
#define MUL_FFT_MODF_THRESHOLD 236 /* k = 5 */
#define MUL_FFT_TABLE3 \
@@ -121,11 +126,11 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MULLO_BASECASE_THRESHOLD 0 /* always */
#define MULLO_DC_THRESHOLD 32
-#define MULLO_MUL_N_THRESHOLD 5397
+#define MULLO_MUL_N_THRESHOLD 6253
-#define DC_DIV_QR_THRESHOLD 28
-#define DC_DIVAPPR_Q_THRESHOLD 67
-#define DC_BDIV_QR_THRESHOLD 27
+#define DC_DIV_QR_THRESHOLD 32
+#define DC_DIVAPPR_Q_THRESHOLD 60
+#define DC_BDIV_QR_THRESHOLD 26
#define DC_BDIV_Q_THRESHOLD 49
#define INV_MULMOD_BNM1_THRESHOLD 22
@@ -133,8 +138,8 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define INV_APPR_THRESHOLD 101
#define BINV_NEWTON_THRESHOLD 199
-#define REDC_1_TO_REDC_2_THRESHOLD 13
-#define REDC_2_TO_REDC_N_THRESHOLD 44
+#define REDC_1_TO_REDC_2_THRESHOLD 23
+#define REDC_2_TO_REDC_N_THRESHOLD 42
#define MU_DIV_QR_THRESHOLD 979
#define MU_DIVAPPR_Q_THRESHOLD 979
@@ -143,12 +148,14 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#define MU_BDIV_Q_THRESHOLD 979
#define MATRIX22_STRASSEN_THRESHOLD 17
-#define HGCD_THRESHOLD 101
-#define GCD_DC_THRESHOLD 222
-#define GCDEXT_DC_THRESHOLD 222
+#define HGCD_THRESHOLD 99
+#define HGCD_APPR_THRESHOLD 117
+#define HGCD_REDUCE_THRESHOLD 1679
+#define GCD_DC_THRESHOLD 198
+#define GCDEXT_DC_THRESHOLD 233
#define JACOBI_BASE_METHOD 4
#define GET_STR_DC_THRESHOLD 12
#define GET_STR_PRECOMPUTE_THRESHOLD 26
-#define SET_STR_DC_THRESHOLD 248
+#define SET_STR_DC_THRESHOLD 422
#define SET_STR_PRECOMPUTE_THRESHOLD 1438
diff --git a/mpn/x86_64/popham.asm b/mpn/x86_64/popham.asm
index 9db368106..999452328 100644
--- a/mpn/x86_64/popham.asm
+++ b/mpn/x86_64/popham.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
-dnl Copyright 2004, 2005, 2007, 2010 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2007, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -44,6 +44,7 @@ ifdef(`OPERATION_popcount',`
define(`h33333333', `%r11')
define(`h0f0f0f0f', `%rcx')
define(`h01010101', `%rdx')
+ define(`POP', `$1')
define(`HAM', `dnl')
')
ifdef(`OPERATION_hamdist',`
@@ -55,17 +56,22 @@ ifdef(`OPERATION_hamdist',`
define(`h33333333', `%r11')
define(`h0f0f0f0f', `%rcx')
define(`h01010101', `%r14')
+ define(`POP', `dnl')
define(`HAM', `$1')
')
MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(func)
-
+ POP(` DOS64_ENTRY(2) ')
+ HAM(` DOS64_ENTRY(3) ')
push %r12
push %r13
HAM(` push %r14 ')
@@ -155,6 +161,6 @@ L(end):
HAM(` pop %r14 ')
pop %r13
pop %r12
+ DOS64_EXIT()
ret
-
EPILOGUE()
diff --git a/mpn/x86_64/redc_1.asm b/mpn/x86_64/redc_1.asm
index 976cab2bc..53b5641a0 100644
--- a/mpn/x86_64/redc_1.asm
+++ b/mpn/x86_64/redc_1.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.
-dnl Copyright 2004, 2008 Free Software Foundation, Inc.
+dnl Copyright 2004, 2008, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -34,41 +34,40 @@ C TODO
C * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
C The code for 1, 2, 3, 4 should perhaps be completely register based.
C * Perhaps align outer loops.
-C * The sub_n at the end leaks side-channel data. How do we fix that?
-C * Write mpn_add_n_sub_n computing R = A + B - C. It should run at 2 c/l.
C * We could software pipeline the IMUL stuff, by putting it before the
C outer loops and before the end of the outer loops. The last outer
C loop iteration would then compute an unneeded product, but it is at
C least not a stray read from up[], since it is at up[n].
-C * Can we combine both the add_n and sub_n into the loops, somehow?
C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`param_mp',`%rdx')
-define(`n', `%rcx')
-define(`invm', `%r8')
+define(`up', `%rdi')
+define(`mp', `%rsi')
+define(`n_param', `%rdx')
+define(`invm', `%rcx')
-define(`mp', `%r13')
+define(`n', `%r13')
define(`i', `%r11')
define(`nneg', `%r12')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_redc_1)
+ DOS64_ENTRY(4)
push %rbp
push %rbx
push %r12
push %r13
push %r14
- push n
- sub $8, %rsp C maintain ABI required rsp alignment
- lea (param_mp,n,8), mp C mp += n
- lea (up,n,8), up C up += n
+ lea (mp,n_param,8), mp C mp += n
+ lea (up,n_param,8), up C up += n
- mov n, nneg
+ mov n_param, nneg
+ mov n_param, n
neg nneg
mov R32(n), R32(%rax)
@@ -136,9 +135,7 @@ L(n1): mov %r14, 16(up,nneg,8) C up[0]
add $8, up
dec n
jnz L(o1)
-C lea (mp), mp
- lea 16(up), up
- jmp L(common)
+ jmp L(ret)
L(b0): C lea (mp), mp
lea -16(up), up
@@ -190,10 +187,7 @@ L(ed0): add %r10, (up)
add $8, up
dec n
jnz L(o0)
-C lea (mp), mp
- lea 16(up), up
- jmp L(common)
-
+ jmp L(ret)
L(b3): lea -8(mp), mp
lea -24(up), up
@@ -244,9 +238,7 @@ L(ed3): add %r10, 8(up)
add $8, up
dec n
jnz L(o3)
- lea 8(mp), mp
- lea 24(up), up
- jmp L(common)
+ jmp L(ret)
L(b2): lea -16(mp), mp
lea -32(up), up
@@ -299,39 +291,12 @@ L(ed2): add %r10, 16(up)
add $8, up
dec n
jnz L(o2)
- lea 16(mp), mp
- lea 32(up), up
-
-
-L(common):
- lea (mp,nneg,8), mp C restore entry mp
-
-C cy = mpn_add_n (rp, up, up - n, n);
-C rdi rsi rdx rcx
- lea (up,nneg,8), up C up -= n
- lea (up,nneg,8), %rdx C rdx = up - n [up entry value]
- mov rp, nneg C preserve rp over first call
- mov 8(%rsp), %rcx C pass entry n
-C mov rp, %rdi
- CALL( mpn_add_n)
- test R32(%rax), R32(%rax)
- jz L(ret)
-
-C mpn_sub_n (rp, rp, mp, n);
-C rdi rsi rdx rcx
- mov nneg, %rdi
- mov nneg, %rsi
- mov mp, %rdx
- mov 8(%rsp), %rcx C pass entry n
- CALL( mpn_sub_n)
-L(ret):
- add $8, %rsp
- pop n C just increment rsp
- pop %r14
+L(ret): pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/rsh1aors_n.asm b/mpn/x86_64/rsh1aors_n.asm
index c4a336446..1b6a103f1 100644
--- a/mpn/x86_64/rsh1aors_n.asm
+++ b/mpn/x86_64/rsh1aors_n.asm
@@ -1,7 +1,7 @@
dnl AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
dnl AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1
-dnl Copyright 2003, 2005, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2009, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -53,11 +53,24 @@ ifdef(`OPERATION_rsh1sub_n', `
MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(func_nc)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
push %rbx
xor R32(%rax), R32(%rax)
@@ -69,6 +82,7 @@ EPILOGUE()
ALIGN(16)
PROLOGUE(func_n)
+ DOS64_ENTRY(4)
push %rbx
xor R32(%rax), R32(%rax)
@@ -169,5 +183,6 @@ L(top): add %rbx, %rbx C rotate carry limb, restore acy
L(end): mov %rbx, (rp)
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/rshift.asm b/mpn/x86_64/rshift.asm
index 0f822a4a0..57a4ab093 100644
--- a/mpn/x86_64/rshift.asm
+++ b/mpn/x86_64/rshift.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_rshift -- mpn right shift.
-dnl Copyright 2003, 2005, 2009 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2009, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -36,10 +36,14 @@ define(`up', `%rsi')
define(`n', `%rdx')
define(`cnt', `%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_rshift)
+ DOS64_ENTRY(4)
neg R32(%rcx) C put rsh count in cl
mov (up), %rax
shl R8(%rcx), %rax C function return value
@@ -156,5 +160,6 @@ L(end):
L(ast): mov (up), %r10
shr R8(%rcx), %r10
mov %r10, (rp)
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/sqr_basecase.asm b/mpn/x86_64/sqr_basecase.asm
index 311daab8a..71195d7ae 100644
--- a/mpn/x86_64/sqr_basecase.asm
+++ b/mpn/x86_64/sqr_basecase.asm
@@ -75,12 +75,14 @@ define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
ASM_START()
TEXT
ALIGN(16)
-
PROLOGUE(mpn_sqr_basecase)
+ DOS64_ENTRY(3)
add $-40, %rsp
mov %rbx, 32(%rsp)
mov %rbp, 24(%rsp)
@@ -115,6 +117,7 @@ L(1): mov (up), %rax
mov %rdx, 8(rp)
add $32, %rsp
pop %rbx
+ DOS64_EXIT()
ret
L(2): mov (up), %rax
@@ -139,6 +142,7 @@ L(2): mov (up), %rax
mov %r11, 24(rp)
add $32, %rsp
pop %rbx
+ DOS64_EXIT()
ret
L(3): mov (up), %rax
@@ -184,6 +188,7 @@ L(3): mov (up), %rax
adc %rbx, 40(rp)
add $32, %rsp
pop %rbx
+ DOS64_EXIT()
ret
L(4): mov (up), %rax
@@ -256,6 +261,7 @@ L(4): mov (up), %rax
pop %r12
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
@@ -780,5 +786,6 @@ L(d1): mov %r11, 24(rp,j,8)
pop %r12
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/sublsh1_n.asm b/mpn/x86_64/sublsh1_n.asm
index a2f48c007..a0515cf18 100644
--- a/mpn/x86_64/sublsh1_n.asm
+++ b/mpn/x86_64/sublsh1_n.asm
@@ -1,6 +1,6 @@
dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
-dnl Copyright 2003, 2005, 2006, 2007 Free Software Foundation, Inc.
+dnl Copyright 2003, 2005, 2006, 2007, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -41,10 +41,14 @@ define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
+ DOS64_ENTRY(4)
push %rbx
push %rbp
@@ -140,5 +144,6 @@ L(end): add R32(%rbp), R32(%rax)
pop %rbp
pop %rbx
+ DOS64_EXIT()
ret
EPILOGUE()
diff --git a/mpn/x86_64/tabselect.asm b/mpn/x86_64/tabselect.asm
new file mode 100644
index 000000000..a6699a9a4
--- /dev/null
+++ b/mpn/x86_64/tabselect.asm
@@ -0,0 +1,123 @@
+dnl AMD64 mpn_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 2.5
+C AMD K10 2.5
+C AMD bobcat 3.5
+C Intel P4 4
+C Intel core2 2.33
+C Intel NHM 2.5
+C Intel SBR 2.2
+C Intel atom 5
+C VIA nano 3.5
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using SSE2/AVX2 could result in many-fold speedup.
+
+C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `%rdi')
+define(`tp', `%rsi')
+define(`n', `%rdx')
+define(`nents', `%rcx')
+define(`which', `%r8')
+
+define(`i', `%rbp')
+define(`maskp', `%r11')
+define(`maskn', `%r12')
+
+C rax rbx rcx rdx rdi rsi rbp (rsp) r8 r9 r10 r11 r12 r13 r14 r15
+C nents n rp tab which
+
+C IFDOS(text) expands to text only when HOST_DOS64 is defined, and to nothing
+C otherwise.  IFELF(text) is the opposite.  They wrap the few instructions that
+C differ between the two hosts.
+ifdef(`HOST_DOS64',`
+ define(`IFDOS', `$1')
+ define(`IFELF', `')
+',`
+ define(`IFDOS', `')
+ define(`IFELF', `$1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(ELF64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+C mpn_tabselect: the table at tp is treated as nents consecutive entries of n
+C limbs each.  Every entry is read in full and blended into rp under a pair of
+C complementary masks, so for 0 <= which < nents the n limbs at rp end up equal
+C to entry number which.  The addresses touched and the branches taken depend
+C only on n and nents, never on which.
+C NOTE(review): presumably this is meant for side-channel resistance; confirm
+C against the callers.
+C Both loops run their body before testing the counter, so n >= 1 and
+C nents >= 1 are required.
+PROLOGUE(mpn_tabselect)
+ DOS64_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C 5th arg: 40 + 2*8 (2 push insns in DOS64_ENTRY)
+ push %rbx C rbx, rbp and r12 are used as scratch below
+ push %rbp
+ push %r12
+
+ lea (rp,n,8), rp C rp = just past the end of the result area
+ lea (tp,n,8), tp C tp = just past the end of table entry 0
+ sub nents, which C bias: which + nents is now 0 in the pass for the wanted entry
+L(outer): C one pass per table entry, k = 0, 1, ...
+ lea (which,nents), %rax C rax = which - k in original terms
+ neg %rax C set CF iff 'which' != k
+ sbb maskn, maskn C maskn = all ones if CF (keep rp limb), else 0
+ mov maskn, maskp
+ not maskp C maskp = complement of maskn (take tp limb)
+
+ mov n, i
+ neg i C i = -n, counts up towards 0
+ test $1, R32(n)
+ je L(top) C even n: straight to the two-limb loop
+ mov (tp,i,8), %rax C odd n: blend a single limb first
+ and maskp, %rax
+ mov (rp,i,8), %r9
+ and maskn, %r9
+ or %r9, %rax
+ mov %rax, (rp,i,8)
+ add $1, i
+ jns L(end) C i reached 0, so n was 1
+
+ ALIGN(16)
+L(top): mov (tp,i,8), %rax C two limbs per iteration
+ mov 8(tp,i,8), %rbx
+ and maskp, %rax
+ and maskp, %rbx
+ mov (rp,i,8), %r9
+ mov 8(rp,i,8), %r10
+ and maskn, %r9
+ and maskn, %r10
+ or %r9, %rax C (tp limb AND maskp) OR (rp limb AND maskn)
+ or %r10, %rbx
+ mov %rax, (rp,i,8)
+ mov %rbx, 8(rp,i,8)
+ add $2, i
+ js L(top) C loop while i < 0
+
+L(end): lea (tp,n,8), tp C step tp to just past the end of the next entry
+ dec nents
+ jne L(outer)
+
+L(outer_end): C not the target of any jump in this routine
+ pop %r12
+ pop %rbp
+ pop %rbx
+ DOS64_EXIT()
+ ret
+EPILOGUE()
diff --git a/mpn/x86_64/x86_64-defs.m4 b/mpn/x86_64/x86_64-defs.m4
index 6942a7882..79d7b3cf2 100644
--- a/mpn/x86_64/x86_64-defs.m4
+++ b/mpn/x86_64/x86_64-defs.m4
@@ -2,8 +2,8 @@ divert(-1)
dnl m4 macros for amd64 assembler.
-dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free
-dnl Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009, 2011
+dnl Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -169,4 +169,7 @@ ifdef(`PIC',
define(`JUMPTABSECT', `.section .data.rel.ro.local,"aw",@progbits')
+dnl Default DOS64_ENTRY and DOS64_EXIT expand to nothing, so the asm files can
+dnl invoke them unconditionally.
+dnl NOTE(review): presumably dos64.m4 is processed after this file on DOS64
+dnl hosts and replaces these with the real glue; confirm in the configure logic.
+define(`DOS64_ENTRY',`')
+define(`DOS64_EXIT',`')
+
divert`'dnl