author     tege <tege@gmplib.org>   2007-09-03 17:32:05 +0200
committer  tege <tege@gmplib.org>   2007-09-03 17:32:05 +0200
commit     7d89ea43689920dd9962f5fdd41ccc5e615e4f86
tree       51fb6839aea96de0ebac1b44c0366066c03acdb6 /mpn/x86_64/addmul_2.asm
parent     8e96ca77fa3d4dd2661bb74f10322a409058ac19
download   gmp-7d89ea43689920dd9962f5fdd41ccc5e615e4f86.tar.gz
*** empty log message ***
Diffstat (limited to 'mpn/x86_64/addmul_2.asm')
-rw-r--r--  mpn/x86_64/addmul_2.asm  239
1 file changed, 239 insertions, 0 deletions
diff --git a/mpn/x86_64/addmul_2.asm b/mpn/x86_64/addmul_2.asm
new file mode 100644
index 000000000..b42c627bf
--- /dev/null
+++ b/mpn/x86_64/addmul_2.asm
@@ -0,0 +1,239 @@
+dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
+dnl add the result to a third limb vector.
+
+dnl Copyright 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
+dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+dnl Boston, MA 02110-1301, USA.
+
+include(`../config.m4')
+
+C cycles/limb
+C K8: 2.86
+C P4: 13.65  FIXME: now it seems to have magically dropped to 15 c/l.
+C P6-15: 4.67
+
+C TODO
+C 1. Try scheduling mul last in loop, just like we do in addmul_1.
+C 2. Try more loop variants and play with this variant more.  We reached the
+C    current speed without much effort, so it is surely not optimal.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`vp', `%rcx')
+
+ TEXT
+ ALIGN(16)
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+
+ push %rbx
+ push %rbp
+ push %r12
+
+define(`vl', `%r9')
+define(`vh', `%r10')
+ mov (vp), vl
+ mov 8(vp), vh
+
+ mov n, %r11
+define(`n', `%r11')
+
+ lea (up,n,8), up
+ lea (rp,n,8), rp
+ neg n
+
+ xor %r8, %r8
+ xor %ebx, %ebx
+ xor %ecx, %ecx
+ xor %rbp, %rbp
+
+ mov (up,n,8), %r12
+ mov %r12, %rax
+ add $3, n
+ jns .Lend C <= 4 iterations
+
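+C The main loop is unrolled 2 x 3 ways: each full pass handles six limbs of
+C up[], with an early exit after three.  %rbx, %rcx and %rbp rotate as carry
+C accumulators, and %r8 is kept at zero for clearing the next accumulator.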
+ ALIGN(32)
+.Loop: mul vl
+ add %rax, %rbx
+ mov %r12, %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul vh
+ add %rbx, -24(rp,n,8)
+ mov -16(up,n,8), %r12
+ mov %r8d, %ebx
+ adc %rax, %rcx
+ mov %r12, %rax
+ adc %rdx, %rbp
+
+ mul vl
+ add %rax, %rcx
+ mov %r12, %rax
+ adc %rdx, %rbp
+ adc $0, %ebx
+ mul vh
+ add %rcx, -16(rp,n,8)
+ mov -8(up,n,8), %r12
+ mov %r8d, %ecx
+ adc %rax, %rbp
+ mov %r12, %rax
+ adc %rdx, %rbx
+
+ mul vl
+ add %rax, %rbp
+ mov %r12, %rax
+ adc %rdx, %rbx
+ adc $0, %ecx
+ mul vh
+ add %rbp, -8(rp,n,8)
+ mov (up,n,8), %r12
+ mov %r8d, %ebp
+ adc %rax, %rbx
+ mov %r12, %rax
+ adc %rdx, %rcx
+
+ add $3, n
+ jns .Lend
+
+ mul vl
+ add %rax, %rbx
+ mov %r12, %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul vh
+ add %rbx, -24(rp,n,8)
+ mov -16(up,n,8), %r12
+ mov %r8d, %ebx
+ adc %rax, %rcx
+ mov %r12, %rax
+ adc %rdx, %rbp
+
+ mul vl
+ add %rax, %rcx
+ mov %r12, %rax
+ adc %rdx, %rbp
+ adc $0, %ebx
+ mul vh
+ add %rcx, -16(rp,n,8)
+ mov -8(up,n,8), %r12
+ mov %r8d, %ecx
+ adc %rax, %rbp
+ mov %r12, %rax
+ adc %rdx, %rbx
+
+ mul vl
+ add %rax, %rbp
+ mov %r12, %rax
+ adc %rdx, %rbx
+ adc $0, %ecx
+ mul vh
+ add %rbp, -8(rp,n,8)
+ mov (up,n,8), %r12
+ mov %r8d, %ebp
+ adc %rax, %rbx
+ mov %r12, %rax
+ adc %rdx, %rcx
+
+ add $3, n
+ js .Loop
+
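+C Wind down: handle the final 3, 2 or 1 limbs, depending on n mod 3.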
+.Lend: jne .Ln3
+ mul vl
+ add %rax, %rbx
+ mov %r12, %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul vh
+ add %rbx, -24(rp)
+ mov -16(up), %r12
+ mov %r8d, %ebx
+ adc %rax, %rcx
+ mov %r12, %rax
+ adc %rdx, %rbp
+ mul vl
+ add %rax, %rcx
+ mov %r12, %rax
+ adc %rdx, %rbp
+ adc $0, %ebx
+ mul vh
+ add %rcx, -16(rp)
+ mov -8(up), %r12
+ mov %r8d, %ecx
+ adc %rax, %rbp
+ mov %r12, %rax
+ adc %rdx, %rbx
+ mul vl
+ add %rax, %rbp
+ mov %r12, %rax
+ adc %rdx, %rbx
+ adc $0, %ecx
+ mul vh
+ add %rbp, -8(rp)
+ adc %rax, %rbx
+ adc %rdx, %rcx
+ mov %rbx, (rp)
+ mov %rcx, %rax
+ jmp .Lret
+
+.Ln3: cmp $1, n
+ jne .Ln2
+ mul vl
+ add %rax, %rbx
+ mov %r12, %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul vh
+ add %rbx, -16(rp)
+ mov -8(up), %r12
+ mov %r8d, %ebx
+ adc %rax, %rcx
+ mov %r12, %rax
+ adc %rdx, %rbp
+ mul vl
+ add %rax, %rcx
+ mov %r12, %rax
+ adc %rdx, %rbp
+ adc $0, %ebx
+ mul vh
+ add %rcx, -8(rp)
+ adc %rax, %rbp
+ adc %rdx, %rbx
+ mov %rbp, (rp)
+ mov %rbx, %rax
+ jmp .Lret
+
+.Ln2: mul vl
+ add %rax, %rbx
+ mov %r12, %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul vh
+ add %rbx, -8(rp)
+ adc %rax, %rcx
+ adc %rdx, %rbp
+ mov %rcx, (rp)
+ mov %rbp, %rax
+
+.Lret: pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+
+EPILOGUE()
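
For readers of this patch, here is a plain-C reference sketch of what the routine
computes, written in terms of the public mpn_addmul_1.  It assumes the conventional
mpn_addmul_2 semantics suggested by the header comment and by the final stores
above: the low n limbs of {rp,n} + {up,n} * {vp,2} are written back to {rp,n}, the
next limb is stored at rp[n], and the most significant limb is returned.  This is
an illustrative sketch, not part of the patch and not the routine's actual
definition.

    /* Reference sketch (illustrative only), assuming the semantics described
       above.  mpn_addmul_1 adds {up,n} * v to {rp,n} and returns the carry.  */
    #include <gmp.h>

    static mp_limb_t
    ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
    {
      /* {rp,n} += {up,n} * vp[0]; the carry limb lands in rp[n].  */
      rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);
      /* {rp+1,n} += {up,n} * vp[1]; this picks up the limb just stored at
         rp[n], and the final carry becomes the return value.  */
      return mpn_addmul_1 (rp + 1, up, n, vp[1]);
    }

The assembly computes the same thing in one fused pass over up[] and rp[], which
is where the speedup over two separate mpn_addmul_1 calls comes from.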
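
The define lines near the top map the operands onto the System V AMD64 argument
registers (%rdi, %rsi, %rdx, %rcx), i.e. the routine is called like an ordinary C
function with an internal prototype of the form
mp_limb_t mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp).
A minimal usage sketch for the reference function above, with purely illustrative
values:

    #include <stdio.h>
    #include <gmp.h>

    int
    main (void)
    {
      mp_limb_t up[3] = { 5, 6, 7 };        /* n = 3 source limbs          */
      mp_limb_t vp[2] = { 10, 1 };          /* 2-limb multiplier           */
      mp_limb_t rp[4] = { 1, 2, 3, 0 };     /* rp[3] is written, not added */
      mp_limb_t hi;

      hi = ref_addmul_2 (rp, up, 3, vp);    /* or mpn_addmul_2 where available */

      /* rp[0..3] now holds the low four limbs of {1,2,3} + {5,6,7} * {10,1};
         hi is the most significant limb (zero for these small values).  */
      for (int i = 0; i < 4; i++)
        printf ("rp[%d] = %lu\n", i, (unsigned long) rp[i]);
      printf ("hi = %lu\n", (unsigned long) hi);
      return 0;
    }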