diff options
Diffstat (limited to 'mpn/powerpc64/mode64/p6')
-rw-r--r-- | mpn/powerpc64/mode64/p6/aorsmul_1.asm | 172 | ||||
-rw-r--r-- | mpn/powerpc64/mode64/p6/gmp-mparam.h | 85 | ||||
-rw-r--r-- | mpn/powerpc64/mode64/p6/mul_basecase.asm | 2 |
3 files changed, 219 insertions, 40 deletions
diff --git a/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/mpn/powerpc64/mode64/p6/aorsmul_1.asm new file mode 100644 index 000000000..4bd508488 --- /dev/null +++ b/mpn/powerpc64/mode64/p6/aorsmul_1.asm @@ -0,0 +1,172 @@ +dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 +dnl Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 ? ? +C POWER4/PPC970 ? ? +C POWER5 ? ? +C POWER6 12.25 12.8 +C POWER7 ? ? + +C TODO +C * Reduce register usage. +C * Schedule function entry code. +C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling +C would bring us to 9 c/l. +C * Handle n = 1 and perhaps n = 2 separately, without saving any registers. 
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ifdef(`OPERATION_addmul_1',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_addmul_1) + define(func_nc, mpn_addmul_1c) C FIXME: not really supported + define(AM, `$1') + define(SM, `') + define(CLRRSC, `addic $1, r0, 0') +') +ifdef(`OPERATION_submul_1',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_submul_1) + define(func_nc, mpn_submul_1c) C FIXME: not really supported + define(AM, `') + define(SM, `$1') + define(CLRRSC, `subfc $1, r0, r0') +') + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 3 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C copy loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + +L(b3): ld r8, 0(up) + ld r7, 8(up) + ld r27, 16(up) + addi up, up, 16 + addi rp, rp, 16 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r29, -16(rp) + ld r30, -8(rp) + ld r31, 0(rp) + addc r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r5, r5, r29 + b L(l3) + +L(b2): ld r7, 0(up) + ld r27, 8(up) + addi up, up, 8 + addi rp, rp, 8 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r30, -8(rp) + ld r31, 0(rp) + addc r11, r11, r7 + addze r12, r27 + ADDSUB r9, r9, r30 + b L(l2) + +L(b1): ld r27, 0(up) + ld r31, 0(rp) + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ADDSUB r11, r11, r31 + b L(l1) + +L(b0): addi up, up, -8 + addi rp, rp, -8 + CLRRSC( r12) C clear r12 and clr/set cy + + ALIGN(32) +L(top): +SM(` subfe r11, r0, r0') C complement... 
+SM(` addic r11, r11, 1') C ...carry flag + ld r10, 8(up) + ld r8, 16(up) + ld r7, 24(up) + ld r27, 32(up) + addi up, up, 32 + addi rp, rp, 32 + mulld r0, r10, v0 + mulhdu r10, r10, v0 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r28, -24(rp) + adde r0, r0, r12 + ld r29, -16(rp) + adde r5, r5, r10 + ld r30, -8(rp) + ld r31, 0(rp) + adde r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r0, r0, r28 + std r0, -24(rp) + ADDSUBC r5, r5, r29 +L(l3): std r5, -16(rp) + ADDSUBC r9, r9, r30 +L(l2): std r9, -8(rp) + ADDSUBC r11, r11, r31 +L(l1): std r11, 0(rp) + bdnz L(top) + +AM(` addze r3, r12') +SM(` subfe r11, r0, r0') C complement... + ld r31, -8(r1) +SM(` subf r3, r11, r12') + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() diff --git a/mpn/powerpc64/mode64/p6/gmp-mparam.h b/mpn/powerpc64/mode64/p6/gmp-mparam.h index d447b56d9..5392138f1 100644 --- a/mpn/powerpc64/mode64/p6/gmp-mparam.h +++ b/mpn/powerpc64/mode64/p6/gmp-mparam.h @@ -1,7 +1,7 @@ -/* gmp-mparam.h -- Compiler/machine parameter header file. +/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file. -Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010 Free -Software Foundation, Inc. +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002, 2003, 2009, 2010, 2011 +Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -31,6 +31,7 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ #define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 #define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ #define DIVEXACT_1_THRESHOLD 0 /* always (native) */ #define BMOD_1_TO_MOD_1_THRESHOLD 21 @@ -38,23 +39,27 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define MUL_TOOM33_THRESHOLD 50 #define MUL_TOOM44_THRESHOLD 112 #define MUL_TOOM6H_THRESHOLD 274 -#define MUL_TOOM8H_THRESHOLD 430 +#define MUL_TOOM8H_THRESHOLD 339 #define MUL_TOOM32_TO_TOOM43_THRESHOLD 62 -#define MUL_TOOM32_TO_TOOM53_THRESHOLD 84 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76 #define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 -#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 78 -#define SQR_BASECASE_THRESHOLD 9 -#define SQR_TOOM2_THRESHOLD 30 -#define SQR_TOOM3_THRESHOLD 53 -#define SQR_TOOM4_THRESHOLD 148 +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 49 +#define SQR_TOOM4_THRESHOLD 136 #define SQR_TOOM6_THRESHOLD 226 -#define SQR_TOOM8_THRESHOLD 430 +#define SQR_TOOM8_THRESHOLD 393 + +#define MULMID_TOOM42_THRESHOLD 36 #define MULMOD_BNM1_THRESHOLD 14 #define SQRMOD_BNM1_THRESHOLD 14 +#define POWM_SEC_TABLE 4,23,213,840,2618 + #define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */ #define MUL_FFT_TABLE3 \ { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ @@ -106,34 +111,36 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 
*/ #define SQR_FFT_THRESHOLD 2368 #define MULLO_BASECASE_THRESHOLD 5 -#define MULLO_DC_THRESHOLD 28 -#define MULLO_MUL_N_THRESHOLD 6633 - -#define DC_DIV_QR_THRESHOLD 27 -#define DC_DIVAPPR_Q_THRESHOLD 112 -#define DC_BDIV_QR_THRESHOLD 29 -#define DC_BDIV_Q_THRESHOLD 86 - -#define INV_MULMOD_BNM1_THRESHOLD 47 -#define INV_NEWTON_THRESHOLD 93 -#define INV_APPR_THRESHOLD 91 - -#define BINV_NEWTON_THRESHOLD 132 -#define REDC_1_TO_REDC_N_THRESHOLD 39 - -#define MU_DIV_QR_THRESHOLD 855 -#define MU_DIVAPPR_Q_THRESHOLD 807 -#define MUPI_DIV_QR_THRESHOLD 33 -#define MU_BDIV_QR_THRESHOLD 807 -#define MU_BDIV_Q_THRESHOLD 872 - -#define MATRIX22_STRASSEN_THRESHOLD 11 -#define HGCD_THRESHOLD 64 -#define GCD_DC_THRESHOLD 237 -#define GCDEXT_DC_THRESHOLD 183 +#define MULLO_DC_THRESHOLD 61 +#define MULLO_MUL_N_THRESHOLD 3271 + +#define DC_DIV_QR_THRESHOLD 59 +#define DC_DIVAPPR_Q_THRESHOLD 200 +#define DC_BDIV_QR_THRESHOLD 70 +#define DC_BDIV_Q_THRESHOLD 168 + +#define INV_MULMOD_BNM1_THRESHOLD 61 +#define INV_NEWTON_THRESHOLD 166 +#define INV_APPR_THRESHOLD 166 + +#define BINV_NEWTON_THRESHOLD 222 +#define REDC_1_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 59 +#define MU_BDIV_QR_THRESHOLD 889 +#define MU_BDIV_Q_THRESHOLD 1078 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 109 +#define HGCD_APPR_THRESHOLD 108 +#define HGCD_REDUCE_THRESHOLD 1052 +#define GCD_DC_THRESHOLD 501 +#define GCDEXT_DC_THRESHOLD 249 #define JACOBI_BASE_METHOD 4 -#define GET_STR_DC_THRESHOLD 17 -#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 #define SET_STR_DC_THRESHOLD 532 -#define SET_STR_PRECOMPUTE_THRESHOLD 1648 +#define SET_STR_PRECOMPUTE_THRESHOLD 1639 diff --git a/mpn/powerpc64/mode64/p6/mul_basecase.asm b/mpn/powerpc64/mode64/p6/mul_basecase.asm index 427d6081a..52c5af8ff 100644 --- a/mpn/powerpc64/mode64/p6/mul_basecase.asm +++ 
b/mpn/powerpc64/mode64/p6/mul_basecase.asm @@ -1,4 +1,4 @@ -dnl PowerPC-64 mpn_basecase. +dnl PowerPC-64 mpn_mul_basecase. dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010 Free dnl Software Foundation, Inc. |