summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog9
-rw-r--r--configure.in3
-rw-r--r--gmp-impl.h3
-rw-r--r--mpn/asm-defs.m41
-rw-r--r--mpn/x86_64/addaddmul_1msb0.asm155
5 files changed, 170 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 79f9b13e2..ceabbbfb7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2008-09-22 Niels Möller <nisse@lysator.liu.se> <nisse@king.swox.se>
+
+ * gmp-impl.h: Declare mpn_addaddmul_1msb0.
+ * mpn/asm-defs.m4: Added addaddmul_1msb0.
+ * mpn/x86_64/addaddmul_1msb0.asm: New file.
+ * configure.in (gmp_mpn_functions_optional): Added
+ addaddmul_1msb0.
+ (HAVE_NATIVE): List addaddmul_1msb0.
+
2008-09-21 Torbjorn Granlund <tege@swox.com>
* mpn/generic/get_str.c (GET_STR_DC_THRESHOLD): Remove default.
diff --git a/configure.in b/configure.in
index 7eea50ee4..bc204bb11 100644
--- a/configure.in
+++ b/configure.in
@@ -2399,7 +2399,7 @@ gmp_mpn_functions_optional="umul udiv copyi copyd com_n
gcd_finda invert_limb sqr_diagonal \
mul_2 mul_3 mul_4 \
addmul_2 addmul_3 addmul_4 addmul_5 addmul_6 addmul_7 addmul_8 \
- addlsh1_n sublsh1_n rsh1add_n rsh1sub_n addsub_n lshiftc"
+ addlsh1_n sublsh1_n rsh1add_n rsh1sub_n addsub_n addaddmul_1msb0 lshiftc"
gmp_mpn_functions="$extra_functions \
add add_1 add_n sub sub_1 sub_n mul_1 addmul_1 \
@@ -2865,6 +2865,7 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_mpn_addmul_7
#undef HAVE_NATIVE_mpn_addmul_8
#undef HAVE_NATIVE_mpn_addsub_n
+#undef HAVE_NATIVE_mpn_addaddmul_1msb0
#undef HAVE_NATIVE_mpn_and_n
#undef HAVE_NATIVE_mpn_andn_n
#undef HAVE_NATIVE_mpn_bdiv_dbm1c
diff --git a/gmp-impl.h b/gmp-impl.h
index 4dcfc6497..271ddb9ec 100644
--- a/gmp-impl.h
+++ b/gmp-impl.h
@@ -826,6 +826,9 @@ __GMP_DECLSPEC mp_limb_t mpn_addsub_n __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, m
#define mpn_addsub_nc __MPN(addsub_nc)
__GMP_DECLSPEC mp_limb_t mpn_addsub_nc __GMP_PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
+#define mpn_addaddmul_1msb0 __MPN(addaddmul_1msb0)
+__GMP_DECLSPEC mp_limb_t mpn_addaddmul_1msb0 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+
#define mpn_divrem_1c __MPN(divrem_1c)
__GMP_DECLSPEC mp_limb_t mpn_divrem_1c __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
diff --git a/mpn/asm-defs.m4 b/mpn/asm-defs.m4
index b89dc01d6..3c09f94f7 100644
--- a/mpn/asm-defs.m4
+++ b/mpn/asm-defs.m4
@@ -1310,6 +1310,7 @@ define_mpn(addmul_3)
define_mpn(addmul_4)
define_mpn(addsub_n)
define_mpn(addsub_nc)
+define_mpn(addaddmul_1msb0)
define_mpn(and_n)
define_mpn(andn_n)
define_mpn(bdiv_dbm1c)
diff --git a/mpn/x86_64/addaddmul_1msb0.asm b/mpn/x86_64/addaddmul_1msb0.asm
new file mode 100644
index 000000000..89e7bed98
--- /dev/null
+++ b/mpn/x86_64/addaddmul_1msb0.asm
@@ -0,0 +1,155 @@
+dnl AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C K8: 2.167
+C P4: 12.0
+C P6-15: 4.0
+
+C TODO
+C * Perhaps handle various n mod 3 sizes better. The code now is too large.
+
+C INPUT PARAMETERS
+C The six arguments arrive in the System V AMD64 integer argument
+C registers, in prototype order.
+C   rp        destination limb pointer (written by the function body)
+C   ap        first source limb pointer, its limbs are multiplied by %r8
+C   bp_param  second source limb pointer, its limbs are multiplied by %r9
+C   n         limb count, used as a scale-8 index
+C   u0, v0    the two single-limb multipliers
+C NOTE(review): the function body refers to %r8 and %r9 directly rather
+C than through the u0 and v0 names defined here.
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`bp_param', `%rdx')
+define(`n', `%rcx')
+define(`u0', `%r8')
+define(`v0', `%r9')
+
+
+C Working copy of the second source pointer.  Every mul overwrites %rdx,
+C which is where bp_param arrives, so the pointer is kept in %rbp instead.
+define(`bp', `%rbp')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+C mpn_addaddmul_1msb0 (rp, ap, bp_param, n, u0, v0)
+C
+C Forms ap[]*u0 + bp[]*v0 one limb at a time, stores the low n limbs at
+C rp[] and returns the final high (carry) limb in %rax.  The multipliers
+C are used directly as %r8 (u0) and %r9 (v0).
+C
+C The file header requires u0, v0 < 2^63.  Presumably that bound is what
+C allows the two high product words of a limb to be summed with adc
+C without handling a carry out -- TODO confirm.
+C
+C Registers: %r12 and %rbp are pushed on entry and popped before each ret.
+C %r12, %r10 and %r11 rotate as the low/high accumulators.  %rax and %rdx
+C are overwritten by every mul.
+PROLOGUE(mpn_addaddmul_1msb0)
+ push %r12
+ push %rbp
+
+C Point ap, bp and rp just past their last limb and negate n, so that
+C (ptr,n,8) addresses limbs with a negative index counting up towards 0.
+ lea (ap,n,8), ap
+ lea (bp_param,n,8), bp
+ lea (rp,n,8), rp
+ neg n
+
+C Start the first limb: %r10:%r12 = ap[0] * u0, and preload %rax with
+C bp[0].  After n += 3 a non-negative n means at most three limbs in
+C total, so the loop is skipped.
+ mov (ap,n,8), %rax
+ mul %r8
+ mov %rax, %r12
+ mov (bp,n,8), %rax
+ mov %rdx, %r10
+ add $3, n
+ jns L(end)
+
+C Main loop, three limbs per pass.  State at L(top), for the limb at
+C index n-3:  %r12 = low word accumulated so far, %r10 = high word,
+C %rax = the bp limb still to be multiplied by v0.
+C Each mov $0 sits between an add and its adc: mov leaves the carry flag
+C intact while clearing the register that receives the next high word.
+ ALIGN(16)
+C Limb n-3: add bp*v0, then store; %r10 becomes the next low word.
+L(top): mul %r9
+ add %rax, %r12
+ mov -16(ap,n,8), %rax
+ adc %rdx, %r10
+ mov %r12, -24(rp,n,8)
+C Limb n-2: accumulate ap*u0 and bp*v0 into %r11:%r10, then store %r10.
+ mul %r8
+ add %rax, %r10
+ mov -16(bp,n,8), %rax
+ mov $0, %r11d
+ adc %rdx, %r11
+ mul %r9
+ add %rax, %r10
+ mov -8(ap,n,8), %rax
+ adc %rdx, %r11
+ mov %r10, -16(rp,n,8)
+C Limb n-1: accumulate ap*u0 and bp*v0 into %r12:%r11.
+ mul %r8
+ add %rax, %r11
+ mov -8(bp,n,8), %rax
+ mov $0, %r12d
+ adc %rdx, %r12
+ mul %r9
+ add %rax, %r11
+ adc %rdx, %r12
+C Start limb n (ap*u0 into %r10:%r12) and preload its bp limb, which
+C re-establishes the L(top) state.  The store of limb n-1 is interleaved.
+ mov (ap,n,8), %rax
+ mul %r8
+ add %rax, %r12
+ mov %r11, -8(rp,n,8)
+ mov (bp,n,8), %rax
+ mov $0, %r10d
+ adc %rdx, %r10
+ add $3, n
+ js L(top)
+
+C Wind-down.  Assuming n >= 1 on entry, n is now 0, 1 or 2 and 3 - n
+C limbs remain, with the same register state as at L(top).
+C n = 2 goes to 2: (one limb), n = 1 goes to 1: (two limbs), and n = 0
+C falls through (three limbs).  Offsets below are relative to the
+C end-of-array pointers.
+L(end): cmp $1, R32(n)
+ ja 2f
+ jz 1f
+
+C Three limbs left: same sequence as one loop pass, without starting a
+C fourth limb.  The last high word, %r12, is the return value.
+ mul %r9
+ add %rax, %r12
+ mov -16(ap), %rax
+ adc %rdx, %r10
+ mov %r12, -24(rp)
+ mul %r8
+ add %rax, %r10
+ mov -16(bp), %rax
+ mov $0, %r11d
+ adc %rdx, %r11
+ mul %r9
+ add %rax, %r10
+ mov -8(ap), %rax
+ adc %rdx, %r11
+ mov %r10, -16(rp)
+ mul %r8
+ add %rax, %r11
+ mov -8(bp), %rax
+ mov $0, %r12d
+ adc %rdx, %r12
+ mul %r9
+ add %rax, %r11
+ adc %rdx, %r12
+ mov %r11, -8(rp)
+ mov %r12, %rax
+ pop %rbp
+ pop %r12
+ ret
+
+C Two limbs left: finish the pending limb, do one more, return %r11.
+1: mul %r9
+ add %rax, %r12
+ mov -8(ap), %rax
+ adc %rdx, %r10
+ mov %r12, -16(rp)
+ mul %r8
+ add %rax, %r10
+ mov -8(bp), %rax
+ mov $0, %r11d
+ adc %rdx, %r11
+ mul %r9
+ add %rax, %r10
+ adc %rdx, %r11
+ mov %r10, -8(rp)
+ mov %r11, %rax
+ pop %rbp
+ pop %r12
+ ret
+
+C One limb left: add bp*v0 to the pending limb, store it, return %r10.
+C The store sits between add and adc; mov does not disturb the carry.
+2: mul %r9
+ add %rax, %r12
+ mov %r12, -8(rp)
+ adc %rdx, %r10
+ mov %r10, %rax
+ pop %rbp
+ pop %r12
+ ret
+EPILOGUE()