diff options
-rw-r--r-- | ChangeLog | 9 | ||||
-rw-r--r-- | configure.in | 3 | ||||
-rw-r--r-- | gmp-impl.h | 3 | ||||
-rw-r--r-- | mpn/asm-defs.m4 | 1 | ||||
-rw-r--r-- | mpn/x86_64/addaddmul_1msb0.asm | 155 |
5 files changed, 170 insertions, 1 deletions
@@ -1,3 +1,12 @@ +2008-09-22 Niels Möller <nisse@lysator.liu.se> <nisse@king.swox.se> + + * gmp-impl.h: Declare mpn_addaddmul_1msb0. + * mpn/asm-defs.m4: Added addaddmul_1msb0. + * mpn/x86_64/addaddmul_1msb0.asm: New file. + * configure.in (gmp_mpn_functions_optional): Added + addaddmul_1msb0. + (HAVE_NATIVE): List addaddmul_1msb0. + 2008-09-21 Torbjorn Granlund <tege@swox.com> * mpn/generic/get_str.c (GET_STR_DC_THRESHOLD): Remove default. diff --git a/configure.in b/configure.in index 7eea50ee4..bc204bb11 100644 --- a/configure.in +++ b/configure.in @@ -2399,7 +2399,7 @@ gmp_mpn_functions_optional="umul udiv copyi copyd com_n gcd_finda invert_limb sqr_diagonal \ mul_2 mul_3 mul_4 \ addmul_2 addmul_3 addmul_4 addmul_5 addmul_6 addmul_7 addmul_8 \ - addlsh1_n sublsh1_n rsh1add_n rsh1sub_n addsub_n lshiftc" + addlsh1_n sublsh1_n rsh1add_n rsh1sub_n addsub_n addaddmul_1msb0 lshiftc" gmp_mpn_functions="$extra_functions \ add add_1 add_n sub sub_1 sub_n mul_1 addmul_1 \ @@ -2865,6 +2865,7 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_mpn_addmul_7 #undef HAVE_NATIVE_mpn_addmul_8 #undef HAVE_NATIVE_mpn_addsub_n +#undef HAVE_NATIVE_mpn_addaddmul_1msb0 #undef HAVE_NATIVE_mpn_and_n #undef HAVE_NATIVE_mpn_andn_n #undef HAVE_NATIVE_mpn_bdiv_dbm1c diff --git a/gmp-impl.h b/gmp-impl.h index 4dcfc6497..271ddb9ec 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -826,6 +826,9 @@ __GMP_DECLSPEC mp_limb_t mpn_addsub_n __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, m #define mpn_addsub_nc __MPN(addsub_nc) __GMP_DECLSPEC mp_limb_t mpn_addsub_nc __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t)); +#define mpn_addaddmul_1msb0 __MPN(addaddmul_1msb0) +__GMP_DECLSPEC mp_limb_t mpn_addaddmul_1msb0 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t)); + #define mpn_divrem_1c __MPN(divrem_1c) __GMP_DECLSPEC mp_limb_t mpn_divrem_1c __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t)); diff --git a/mpn/asm-defs.m4 b/mpn/asm-defs.m4 index b89dc01d6..3c09f94f7 100644 --- a/mpn/asm-defs.m4 +++ b/mpn/asm-defs.m4 @@ -1310,6 +1310,7 @@ define_mpn(addmul_3) define_mpn(addmul_4) define_mpn(addsub_n) define_mpn(addsub_nc) +define_mpn(addaddmul_1msb0) define_mpn(and_n) define_mpn(andn_n) define_mpn(bdiv_dbm1c) diff --git a/mpn/x86_64/addaddmul_1msb0.asm b/mpn/x86_64/addaddmul_1msb0.asm new file mode 100644 index 000000000..89e7bed98 --- /dev/null +++ b/mpn/x86_64/addaddmul_1msb0.asm @@ -0,0 +1,155 @@ +dnl AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63. + +dnl Copyright 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C K8: 2.167 +C P4: 12.0 +C P6-15: 4.0 + +C TODO +C * Perhaps handle various n mod 3 sizes better. The code now is too large. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp_param', `%rdx') +define(`n', `%rcx') +define(`u0', `%r8') +define(`v0', `%r9') + + +define(`bp', `%rbp') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addaddmul_1msb0) + push %r12 + push %rbp + + lea (ap,n,8), ap + lea (bp_param,n,8), bp + lea (rp,n,8), rp + neg n + + mov (ap,n,8), %rax + mul %r8 + mov %rax, %r12 + mov (bp,n,8), %rax + mov %rdx, %r10 + add $3, n + jns L(end) + + ALIGN(16) +L(top): mul %r9 + add %rax, %r12 + mov -16(ap,n,8), %rax + adc %rdx, %r10 + mov %r12, -24(rp,n,8) + mul %r8 + add %rax, %r10 + mov -16(bp,n,8), %rax + mov $0, %r11d + adc %rdx, %r11 + mul %r9 + add %rax, %r10 + mov -8(ap,n,8), %rax + adc %rdx, %r11 + mov %r10, -16(rp,n,8) + mul %r8 + add %rax, %r11 + mov -8(bp,n,8), %rax + mov $0, %r12d + adc %rdx, %r12 + mul %r9 + add %rax, %r11 + adc %rdx, %r12 + mov (ap,n,8), %rax + mul %r8 + add %rax, %r12 + mov %r11, -8(rp,n,8) + mov (bp,n,8), %rax + mov $0, %r10d + adc %rdx, %r10 + add $3, n + js L(top) + +L(end): cmp $1, R32(n) + ja 2f + jz 1f + + mul %r9 + add %rax, %r12 + mov -16(ap), %rax + adc %rdx, %r10 + mov %r12, -24(rp) + mul %r8 + add %rax, %r10 + mov -16(bp), %rax + mov $0, %r11d + adc %rdx, %r11 + mul %r9 + add %rax, %r10 + mov -8(ap), %rax + adc %rdx, %r11 + mov %r10, -16(rp) + mul %r8 + add %rax, %r11 + mov -8(bp), %rax + mov $0, %r12d + adc %rdx, %r12 + mul %r9 + add %rax, %r11 + adc %rdx, %r12 + mov %r11, -8(rp) + mov %r12, %rax + pop %rbp + pop %r12 + ret + +1: mul %r9 + add %rax, %r12 + mov -8(ap), %rax + adc %rdx, %r10 + mov %r12, -16(rp) + mul %r8 + add %rax, %r10 + mov -8(bp), %rax + mov $0, %r11d + adc %rdx, %r11 + mul %r9 + add %rax, %r10 + adc %rdx, %r11 + mov %r10, -8(rp) + mov %r11, %rax + pop %rbp + pop %r12 + ret + +2: mul %r9 + add %rax, %r12 + mov %r12, -8(rp) + adc %rdx, %r10 + mov %r10, %rax + pop %rbp + pop %r12 + ret +EPILOGUE() |