diff options
author | Richard Maw <richard.maw@codethink.co.uk> | 2012-01-19 10:33:31 +0000 |
---|---|---|
committer | Richard Maw <richard.maw@codethink.co.uk> | 2012-01-19 10:33:31 +0000 |
commit | 29137c6ff7a9e370e2332d855ab46616ad4e9cc9 (patch) | |
tree | fbca7aa7cfa645df1b059aeba7e81739620b013c /mpn/ia64 | |
parent | 962de8d4b353178d38c2c70e952944686b9fd47b (diff) | |
parent | 2c033efc02631f22e6e180ce737a2faf81b09ccc (diff) | |
download | gmp-29137c6ff7a9e370e2332d855ab46616ad4e9cc9.tar.gz |
Merge branch 'master' into baserock/morph
Diffstat (limited to 'mpn/ia64')
-rw-r--r-- | mpn/ia64/gmp-mparam.h | 148 | ||||
-rw-r--r-- | mpn/ia64/tabselect.asm | 139 |
2 files changed, 237 insertions, 50 deletions
diff --git a/mpn/ia64/gmp-mparam.h b/mpn/ia64/gmp-mparam.h index 0841c82aa..77e02f518 100644 --- a/mpn/ia64/gmp-mparam.h +++ b/mpn/ia64/gmp-mparam.h @@ -1,6 +1,6 @@ /* gmp-mparam.h -- Compiler/machine parameter header file. -Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010 Free Software +Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2009, 2010, 2011 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -21,70 +21,94 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define GMP_LIMB_BITS 64 #define BYTES_PER_MP_LIMB 8 -/* 1300MHz Itanium2 (babe.fsffrance.org) */ - +/* 900MHz Itanium2 (titanic.gmplib.org) */ +#define MOD_1_1P_METHOD 2 #define MOD_1_NORM_THRESHOLD 0 /* always */ #define MOD_1_UNNORM_THRESHOLD 0 /* always */ #define MOD_1N_TO_MOD_1_1_THRESHOLD 4 -#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 -#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ -#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21 -#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 26 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 #define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD 12 #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ #define MUL_TOOM22_THRESHOLD 40 -#define MUL_TOOM33_THRESHOLD 122 -#define MUL_TOOM44_THRESHOLD 212 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 214 #define MUL_TOOM6H_THRESHOLD 318 #define MUL_TOOM8H_THRESHOLD 430 -#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 146 -#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129 +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 145 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 126 #define MUL_TOOM42_TO_TOOM63_THRESHOLD 151 #define SQR_BASECASE_THRESHOLD 11 #define SQR_TOOM2_THRESHOLD 84 -#define SQR_TOOM3_THRESHOLD 125 +#define SQR_TOOM3_THRESHOLD 135 #define SQR_TOOM4_THRESHOLD 494 -#define SQR_TOOM6_THRESHOLD 0 /* never toom4 */ -#define SQR_TOOM8_THRESHOLD 0 /* never toom6 */ +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 98 #define MULMOD_BNM1_THRESHOLD 23 -#define SQRMOD_BNM1_THRESHOLD 25 +#define SQRMOD_BNM1_THRESHOLD 28 + +#define POWM_SEC_TABLE 2,29,130,905 -#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ #define MUL_FFT_TABLE3 \ - { { 444, 5}, { 27, 6}, { 14, 5}, { 29, 6}, \ - { 35, 7}, { 18, 6}, { 37, 7}, { 19, 6}, \ + { { 476, 5}, { 27, 6}, { 14, 5}, { 29, 6}, \ + { 33, 7}, { 17, 6}, { 37, 7}, { 19, 6}, \ { 39, 7}, { 21, 6}, { 43, 7}, { 33, 8}, \ { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \ - { 21, 7}, { 43, 8}, { 29, 9}, { 15, 8}, \ - { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ - { 49, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \ - { 63, 9}, { 35, 8}, { 71, 9}, { 43,10}, \ + { 21, 7}, { 43, 8}, { 37, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \ + { 57, 9}, { 31, 8}, { 63, 9}, { 43,10}, \ { 23, 9}, { 59,10}, { 31, 9}, { 71,10}, \ - { 39, 9}, { 87,10}, { 47, 9}, { 99,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ { 55,11}, { 31,10}, { 87,11}, { 47,10}, \ { 111,12}, { 31,11}, { 63,10}, { 143,11}, \ { 79,10}, { 167,11}, { 95,10}, { 191,11}, \ { 111,12}, { 63,11}, { 143,10}, { 287, 9}, \ { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ { 95,11}, { 191,10}, { 399,11}, { 207,10}, \ - { 431,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ - { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ - {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } -#define MUL_FFT_TABLE3_SIZE 76 + { 431,13}, { 63,12}, { 127,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 335,10}, { 671,11}, { 367,12}, \ + { 191,11}, { 399,10}, { 799,11}, { 431,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 415,11}, { 863,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 607,13}, { 319,12}, { 735,13}, \ + { 383,12}, { 799,11}, { 1599,12}, { 863,13}, \ + { 447,12}, { 927,11}, { 1855,14}, { 255,13}, \ + { 511,12}, { 1055,13}, { 575,12}, { 1215,13}, \ + { 639,12}, { 1279,13}, { 703,14}, { 383,13}, \ + { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \ + { 895,12}, { 1791,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1663,14}, { 895,13}, \ + { 1855,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,14}, { 1279,13}, { 2687,14}, \ + { 1407,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,14}, { 1791,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 155 #define MUL_FFT_THRESHOLD 5760 -#define SQR_FFT_MODF_THRESHOLD 440 /* k = 5 */ +#define SQR_FFT_MODF_THRESHOLD 436 /* k = 5 */ #define SQR_FFT_TABLE3 \ - { { 440, 5}, { 14, 4}, { 29, 5}, { 29, 6}, \ - { 15, 5}, { 31, 6}, { 35, 7}, { 18, 6}, \ - { 37, 7}, { 33, 8}, { 17, 7}, { 37, 8}, \ + { { 436, 5}, { 14, 4}, { 29, 5}, { 31, 6}, \ + { 35, 7}, { 18, 6}, { 37, 7}, { 37, 8}, \ { 19, 7}, { 40, 8}, { 37, 9}, { 19, 8}, \ { 43, 9}, { 23, 8}, { 49, 9}, { 27, 8}, \ { 57, 9}, { 43,10}, { 23, 9}, { 55,10}, \ @@ -93,45 +117,69 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ { 87,11}, { 47,10}, { 111,12}, { 31,11}, \ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ { 95,10}, { 191,11}, { 111,12}, { 63,11}, \ - { 127,10}, { 255,11}, { 143,10}, { 303,11}, \ - { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ - { 399,11}, { 207,10}, { 431,13}, { 8192,14}, \ - { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ - { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ - {4194304,23}, {8388608,24} } -#define SQR_FFT_TABLE3_SIZE 66 + { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ + { 95,11}, { 191,10}, { 399,11}, { 207,10}, \ + { 431,13}, { 63,12}, { 127,11}, { 271,10}, \ + { 543,11}, { 303,12}, { 159,11}, { 335,10}, \ + { 671,11}, { 367,10}, { 735,12}, { 191,11}, \ + { 399,10}, { 799,11}, { 431,12}, { 223,11}, \ + { 463,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 607,12}, { 319,11}, { 671,12}, \ + { 351,11}, { 735,13}, { 191,12}, { 383,11}, \ + { 799,12}, { 415,11}, { 863,12}, { 447,11}, \ + { 895,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 607,13}, { 319,12}, { 735,13}, \ + { 383,12}, { 863,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1183,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \ + { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,14}, { 1791,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 151 #define SQR_FFT_THRESHOLD 4032 #define MULLO_BASECASE_THRESHOLD 29 #define MULLO_DC_THRESHOLD 57 #define MULLO_MUL_N_THRESHOLD 11278 -#define DC_DIV_QR_THRESHOLD 59 +#define DC_DIV_QR_THRESHOLD 64 #define DC_DIVAPPR_Q_THRESHOLD 222 #define DC_BDIV_QR_THRESHOLD 95 #define DC_BDIV_Q_THRESHOLD 264 -#define INV_MULMOD_BNM1_THRESHOLD 82 -#define INV_NEWTON_THRESHOLD 11 -#define INV_APPR_THRESHOLD 18 +#define INV_MULMOD_BNM1_THRESHOLD 86 +#define INV_NEWTON_THRESHOLD 139 +#define INV_APPR_THRESHOLD 147 #define BINV_NEWTON_THRESHOLD 252 -#define REDC_1_TO_REDC_2_THRESHOLD 0 +#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */ #define REDC_2_TO_REDC_N_THRESHOLD 147 #define MU_DIV_QR_THRESHOLD 1142 -#define MU_DIVAPPR_Q_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 1142 #define MUPI_DIV_QR_THRESHOLD 0 /* always */ -#define MU_BDIV_QR_THRESHOLD 1187 +#define MU_BDIV_QR_THRESHOLD 1210 #define MU_BDIV_Q_THRESHOLD 1470 #define MATRIX22_STRASSEN_THRESHOLD 23 #define HGCD_THRESHOLD 117 -#define GCD_DC_THRESHOLD 469 +#define HGCD_APPR_THRESHOLD 111 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 555 #define GCDEXT_DC_THRESHOLD 368 #define JACOBI_BASE_METHOD 4 #define GET_STR_DC_THRESHOLD 13 -#define GET_STR_PRECOMPUTE_THRESHOLD 21 -#define SET_STR_DC_THRESHOLD 1204 -#define SET_STR_PRECOMPUTE_THRESHOLD 3266 +#define GET_STR_PRECOMPUTE_THRESHOLD 22 +#define SET_STR_DC_THRESHOLD 1474 +#define SET_STR_PRECOMPUTE_THRESHOLD 3168 diff --git a/mpn/ia64/tabselect.asm b/mpn/ia64/tabselect.asm new file mode 100644 index 000000000..cc5b49b04 --- /dev/null +++ b/mpn/ia64/tabselect.asm @@ -0,0 +1,139 @@ +dnl IA-64 mpn_tabselect. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 2.5 + +C NOTES +C * Using software pipelining could trivially yield 2 c/l without unrolling, +C or 1+epsilon with unrolling. (This code was modelled after the powerpc64 +C code, for simplicity.) + +C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `r32') +define(`tp', `r33') +define(`n', `r34') +define(`nents', `r35') +define(`which', `r36') + +define(`mask', `r8') + +define(`rp1', `r32') +define(`tp1', `r33') +define(`rp2', `r14') +define(`tp2', `r15') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_tabselect) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` +.mmi; addp4 rp = 0, rp C M I + addp4 tp = 0, tp C M I + zxt4 n = n C I +.mii; nop 0 + zxt4 nents = nents C I + zxt4 which = which C I + ;; +') +.mmi; add rp2 = 8, rp1 + add tp2 = 8, tp1 + add r6 = -2, n + ;; +.mmi; cmp.eq p10, p0 = 1, n + and r9 = 1, n C set cr0 for use in inner loop + shr.u r6 = r6, 1 C inner loop count + ;; +.mmi; cmp.eq p8, p0 = 0, r9 + sub which = nents, which + shl n = n, 3 + ;; + +L(outer): +.mmi cmp.eq p6, p7 = which, nents C are we at the selected table entry? + nop 0 + mov ar.lc = r6 C I0 + ;; +.mmb; + (p6) mov mask = -1 + (p7) mov mask = 0 + (p8) br.dptk L(top) C branch to loop entry if n even + ;; + +.mmi; ld8 r16 = [tp1], 8 + add tp2 = 8, tp2 + nop 0 + ;; +.mmi; ld8 r18 = [rp1] + and r16 = r16, mask + nop 0 + ;; +.mmi; andcm r18 = r18, mask + ;; + or r16 = r16, r18 + nop 0 + ;; +.mmb; st8 [rp1] = r16, 8 + add rp2 = 8, rp2 + (p10) br.dpnt L(end) + + ALIGN(32) +L(top): +.mmi; ld8 r16 = [tp1], 16 + ld8 r17 = [tp2], 16 + nop 0 + ;; +.mmi; ld8 r18 = [rp1] + and r16 = r16, mask + nop 0 +.mmi; ld8 r19 = [rp2] + and r17 = r17, mask + nop 0 + ;; +.mmi; andcm r18 = r18, mask + andcm r19 = r19, mask + nop 0 + ;; +.mmi; or r16 = r16, r18 + or r17 = r17, r19 + nop 0 + ;; +.mmb; st8 [rp1] = r16, 16 + st8 [rp2] = r17, 16 + br.cloop.dptk L(top) + ;; +L(end): +.mmi; sub rp1 = rp1, n C move rp back to beginning + sub rp2 = rp2, n C move rp back to beginning + cmp.ne p9, p0 = 1, nents +.mmb; add nents = -1, nents + nop 0 + (p9) br.dptk L(outer) + ;; + +.mib; nop 0 + nop 0 + br.ret.sptk.many b0 +EPILOGUE() |