diff options
author | Niels Möller <nisse@lysator.liu.se> | 2011-10-03 13:30:35 +0200 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2011-10-03 13:30:35 +0200 |
commit | 730b95e57ffcc38a364b0cd1f5352aaf6ea361c4 (patch) | |
tree | b82237bffd1a08089e40a6ab4fb122f04cc37d21 /tune | |
parent | ab5d3c7f9553c3a3058813b7076d061794a57b33 (diff) | |
download | gmp-730b95e57ffcc38a364b0cd1f5352aaf6ea361c4.tar.gz |
Tuning of mulmid.
Diffstat (limited to 'tune')
-rw-r--r-- | tune/Makefile.am | 1 | ||||
-rw-r--r-- | tune/common.c | 56 | ||||
-rw-r--r-- | tune/speed.c | 12 | ||||
-rw-r--r-- | tune/speed.h | 178 | ||||
-rw-r--r-- | tune/tuneup.c | 16 |
5 files changed, 262 insertions, 1 deletions
diff --git a/tune/Makefile.am b/tune/Makefile.am index c932cd3ab..e54c020d4 100644 --- a/tune/Makefile.am +++ b/tune/Makefile.am @@ -131,6 +131,7 @@ TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c \ invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c \ get_str.c set_str.c matrix22_mul.c hgcd.c mul_n.c sqr.c \ mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c \ + mulmid.c mulmid_n.c toom42_mulmid.c \ nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c \ toom22_mul.c toom2_sqr.c toom33_mul.c toom3_sqr.c toom44_mul.c toom4_sqr.c diff --git a/tune/common.c b/tune/common.c index 63d582399..293474014 100644 --- a/tune/common.c +++ b/tune/common.c @@ -926,6 +926,38 @@ speed_mpn_sub_n (struct speed_params *s) SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n); } +double +speed_mpn_add_err1_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_add_err1_n); +} +double +speed_mpn_sub_err1_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_sub_err1_n); +} +double +speed_mpn_add_err2_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_add_err2_n); +} +double +speed_mpn_sub_err2_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_sub_err2_n); +} +double +speed_mpn_add_err3_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_add_err3_n); +} +double +speed_mpn_sub_err3_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_sub_err3_n); +} + + #if HAVE_NATIVE_mpn_add_n_sub_n double speed_mpn_add_n_sub_n (struct speed_params *s) @@ -1369,6 +1401,30 @@ speed_mpn_mullo_basecase (struct speed_params *s) } double +speed_mpn_mulmid_basecase (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMID (mpn_mulmid_basecase); +} + +double +speed_mpn_mulmid (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMID (mpn_mulmid); +} + +double +speed_mpn_mulmid_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMID_N (mpn_mulmid_n); +} + +double +speed_mpn_toom42_mulmid (struct 
speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM42_MULMID (mpn_toom42_mulmid); +} + +double speed_mpn_mulmod_bnm1 (struct speed_params *s) { SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_mulmod_bnm1 (wp, s->size, s->xp, s->size, s->yp, s->size, tp)); diff --git a/tune/speed.c b/tune/speed.c index 27894a868..245318bb3 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -153,6 +153,13 @@ const struct routine_t { { "mpn_add_n", speed_mpn_add_n, FLAG_R_OPTIONAL }, { "mpn_sub_n", speed_mpn_sub_n, FLAG_R_OPTIONAL }, + { "mpn_add_err1_n", speed_mpn_add_err1_n }, + { "mpn_add_err2_n", speed_mpn_add_err2_n }, + { "mpn_add_err3_n", speed_mpn_add_err3_n }, + { "mpn_sub_err1_n", speed_mpn_sub_err1_n }, + { "mpn_sub_err2_n", speed_mpn_sub_err2_n }, + { "mpn_sub_err3_n", speed_mpn_sub_err3_n }, + #if HAVE_NATIVE_mpn_add_n_sub_n { "mpn_add_n_sub_n", speed_mpn_add_n_sub_n, FLAG_R_OPTIONAL }, #endif @@ -332,6 +339,11 @@ const struct routine_t { { "mpn_mullo_n", speed_mpn_mullo_n }, { "mpn_mullo_basecase", speed_mpn_mullo_basecase }, + { "mpn_mulmid_basecase", speed_mpn_mulmid_basecase, FLAG_R_OPTIONAL }, + { "mpn_toom42_mulmid", speed_mpn_toom42_mulmid }, + { "mpn_mulmid_n", speed_mpn_mulmid_n }, + { "mpn_mulmid", speed_mpn_mulmid, FLAG_R_OPTIONAL }, + { "mpn_bc_mulmod_bnm1", speed_mpn_bc_mulmod_bnm1 }, { "mpn_mulmod_bnm1", speed_mpn_mulmod_bnm1 }, { "mpn_mulmod_bnm1_rounded", speed_mpn_mulmod_bnm1_rounded }, diff --git a/tune/speed.h b/tune/speed.h index 48a420668..0074d4f7f 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -117,7 +117,7 @@ struct speed_params { struct { mp_ptr ptr; mp_size_t size; - } src[3], dst[4]; + } src[5], dst[4]; }; typedef double (*speed_function_t) __GMP_PROTO ((struct speed_params *s)); @@ -145,6 +145,9 @@ double speed_binvert_limb_arith __GMP_PROTO ((struct speed_params *s)); double speed_mpf_init_clear __GMP_PROTO ((struct speed_params *s)); double speed_mpn_add_n __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_add_err1_n __GMP_PROTO ((struct speed_params 
*s)); +double speed_mpn_add_err2_n __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_add_err3_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_addlsh_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_addlsh1_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_addlsh2_n __GMP_PROTO ((struct speed_params *s)); @@ -234,6 +237,8 @@ double speed_mpn_mul_5 __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mul_6 __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mul __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mul_basecase __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_mulmid __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_mulmid_basecase __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mul_fft __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mul_fft_sqr __GMP_PROTO ((struct speed_params *s)); double speed_mpn_fft_mul __GMP_PROTO ((struct speed_params *s)); @@ -246,6 +251,7 @@ double speed_mpn_nussbaumer_mul __GMP_PROTO ((struct speed_params *s)); double speed_mpn_nussbaumer_mul_sqr __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mul_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mul_n_sqr __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_mulmid_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mullo_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mullo_basecase __GMP_PROTO ((struct speed_params *s)); double speed_mpn_nand_n __GMP_PROTO ((struct speed_params *s)); @@ -294,6 +300,9 @@ double speed_mpn_sqr __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sqrtrem __GMP_PROTO ((struct speed_params *s)); double speed_mpn_rootrem __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sub_n __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_sub_err1_n __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_sub_err2_n __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_sub_err3_n __GMP_PROTO ((struct speed_params 
*s)); double speed_mpn_sublsh_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sublsh1_n __GMP_PROTO ((struct speed_params *s)); double speed_mpn_sublsh2_n __GMP_PROTO ((struct speed_params *s)); @@ -321,6 +330,7 @@ double speed_mpn_toom32_for_toom53_mul __GMP_PROTO ((struct speed_params *s)); double speed_mpn_toom53_for_toom32_mul __GMP_PROTO ((struct speed_params *s)); double speed_mpn_toom42_for_toom53_mul __GMP_PROTO ((struct speed_params *s)); double speed_mpn_toom53_for_toom42_mul __GMP_PROTO ((struct speed_params *s)); +double speed_mpn_toom42_mulmid __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mulmod_bnm1 __GMP_PROTO ((struct speed_params *s)); double speed_mpn_bc_mulmod_bnm1 __GMP_PROTO ((struct speed_params *s)); double speed_mpn_mulmod_bnm1_rounded __GMP_PROTO ((struct speed_params *s)); @@ -712,6 +722,72 @@ int speed_routine_count_zeros_setup return t; \ } + +/* For mpn_aors_errK_n, where 1 <= K <= 3. */ +#define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K) \ + { \ + mp_ptr wp; \ + mp_ptr xp, yp; \ + mp_ptr zp[K]; \ + mp_limb_t ep[2*K]; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + /* (don't have a mechnanism to specify zp alignments) */ \ + for (i = 0; i < K; i++) \ + SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0); \ + \ + xp = s->xp; \ + yp = s->yp; \ + \ + if (s->r == 0) ; \ + else if (s->r == 1) { xp = wp; } \ + else if (s->r == 2) { yp = wp; } \ + else if (s->r == 3) { xp = wp; yp = wp; } \ + else if (s->r == 4) { yp = xp; } \ + else { \ + TMP_FREE; \ + return -1.0; \ + } \ + \ + /* initialize wp if operand overlap */ \ + if (xp == wp || yp == wp) \ + MPN_COPY (wp, s->xp, s->size); \ + \ + speed_operand_src (s, xp, s->size); \ + speed_operand_src (s, yp, s->size); \ + for (i = 0; i < K; i++) \ + speed_operand_src (s, zp[i], s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + 
speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function) \ + SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1) + +#define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function) \ + SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2) + +#define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function) \ + SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3) + + /* For mpn_add_n, mpn_sub_n, or similar. */ #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call) \ { \ @@ -1050,6 +1126,106 @@ int speed_routine_count_zeros_setup return t; \ } +/* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */ +#define SPEED_ROUTINE_MPN_MULMID(function) \ + { \ + mp_ptr wp, xp; \ + mp_size_t size1; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + size1 = (s->r == 0 ? 
(2 * s->size - 1) : s->r); \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + SPEED_RESTRICT_COND (size1 >= s->size); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ + \ + speed_operand_src (s, xp, size1); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, size1 - s->size + 3); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, xp, size1, s->yp, s->size); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_MULMID_N(function) \ + { \ + mp_ptr wp, xp; \ + mp_size_t size1; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + size1 = 2 * s->size - 1; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ + \ + speed_operand_src (s, xp, size1); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, size1 - s->size + 3); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, xp, s->yp, s->size); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_TOOM42_MULMID(function) \ + { \ + mp_ptr wp, xp, scratch; \ + mp_size_t size1, scratch_size; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + size1 = 2 * s->size - 1; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ + scratch_size = mpn_toom42_mulmid_itch (s->size); \ + SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0); \ + \ + speed_operand_src (s, xp, size1); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, size1 - s->size + 3); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + 
function (wp, xp, s->yp, s->size, scratch); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call) \ { \ mp_ptr wp, tp; \ diff --git a/tune/tuneup.c b/tune/tuneup.c index 88ee2158b..4f53c979c 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -170,6 +170,7 @@ mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX; mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX; mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX; +mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX; mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX; mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX; mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX; @@ -1345,6 +1346,18 @@ tune_mullo (void) } void +tune_mulmid (void) +{ + static struct param_t param; + + param.name = "MULMID_TOOM42_THRESHOLD"; + param.function = speed_mpn_mulmid_n; + param.min_size = 4; + param.max_size = 100; + one (&mulmid_toom42_threshold, &param); +} + +void tune_mulmod_bnm1 (void) { static struct param_t param; @@ -2532,6 +2545,9 @@ all (void) tune_sqr (); printf("\n"); + tune_mulmid (); + printf("\n"); + tune_mulmod_bnm1 (); tune_sqrmod_bnm1 (); printf("\n"); |