author    Torbjorn Granlund <tege@gmplib.org>  2011-02-27 00:29:34 +0100
committer Torbjorn Granlund <tege@gmplib.org>  2011-02-27 00:29:34 +0100
commit    118b7712e0c45cc42a340f35b2e2fc9bef081410 (patch)
tree      cb0a101aee904d90e12c58027f49f2aa077704bc /mpn/x86/atom/sse2
parent    d3868294bd1e9daff0134161ea0bb4f7a0fbfba4 (diff)
download  gmp-118b7712e0c45cc42a340f35b2e2fc9bef081410.tar.gz
New file.
Diffstat (limited to 'mpn/x86/atom/sse2')
-rw-r--r--  mpn/x86/atom/sse2/mul_basecase.asm  530
1 file changed, 530 insertions, 0 deletions
diff --git a/mpn/x86/atom/sse2/mul_basecase.asm b/mpn/x86/atom/sse2/mul_basecase.asm
new file mode 100644
index 000000000..4fbf1876b
--- /dev/null
+++ b/mpn/x86/atom/sse2/mul_basecase.asm
@@ -0,0 +1,530 @@
+dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
+dnl a third limb vector.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+dnl
+dnl Copyright 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C 4 large loops into one; we could use it for the outer loop branch.
+C * Optimise code outside of inner loops.
+C * Play with rp and up offsets to save a bunch of lea insns.
+C * Write combined addmul_1 feed-in and wind-down code, and use it when
+C   iterating each outer loop.  ("Overlapping software pipelining")
+C * Postpone push of ebx until we know vn > 1.
+C * Perhaps write special code for un < M, for some small M.
+C * Replace the addmul_1 loop by a less pipelined loop.  This could save
+C   perhaps 25% of the code size.
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xn,
+C                        mp_srcptr yp, mp_size_t yn);
+C
+
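+C The schoolbook algorithm implemented here, expressed as a rough C sketch
+C in terms of the mpn_mul_1/mpn_addmul_1 primitives (for orientation only,
+C not part of this file):
+C
+C	wp[xn] = mpn_mul_1 (wp, xp, xn, yp[0]);
+C	for (i = 1; i < yn; i++)
+C	  wp[xn + i] = mpn_addmul_1 (wp + i, xp, xn, yp[i]);
+C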
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`un', `%ecx')
+define(`vp', `%ebp')
+define(`vn', `36(%esp)')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ mov 28(%esp), un
+ mov 32(%esp), vp
+
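+C Start the first product up[0]*vp[0] early; mm7 holds the current v limb
+C and mm6 carries the high half of each product into the next limb.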
+ movd (up), %mm0
+ movd (vp), %mm7
+ pmuludq %mm7, %mm0
+ pxor %mm6, %mm6
+
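+C Branch on un mod 4 into one of four software-pipelined mul_1 loops below.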
+ mov un, %eax
+ and $3, %eax
+ jz L(of0)
+ cmp $2, %eax
+ jc L(of1)
+ jz L(of2)
+
+C ================================================================
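+C un mod 4 == 3: mul_1 entry L(m3), loop L(lm3); addmul_1 outer loop L(ol3)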
+ jmp L(m3)
+ ALIGN(16)
+L(lm3): movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(m3): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 8(rp)
+ lea 16(up), up
+ ja L(lm3)
+
+ psrlq $32, %mm6
+ movd %mm6, 12(rp)
+
+ decl vn
+ jz L(done)
+
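+C Outer loop: fetch the next v limb, bump the saved rp by one limb, reload
+C up and un, then add up[]*vp[i] into the intermediate product (addmul_1).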
+L(ol3): lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ lea 4(rp), rp
+ mov rp, 20(%esp)
+ mov 28(%esp), un
+
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ shr $2, un C FIXME: move out
+ lea 4(up), up
+ lea -12(rp), rp
+ movd %mm1, %ebx
+ inc un
+ movd (up), %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a3)
+
+L(la3): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+L(a3): psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ dec un
+ movd 4(up), %mm1
+ jnz L(la3)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ decl vn
+ jnz L(ol3)
+ jmp L(done)
+
+C ================================================================
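+C un mod 4 == 0: mul_1 entry L(of0), loop L(lm0); addmul_1 outer loop L(ol0)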
+ ALIGN(16)
+L(lm0): movd (up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+L(of0): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 12(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 12(rp)
+ lea 16(up), up
+ ja L(lm0)
+
+ psrlq $32, %mm6
+ movd %mm6, 16(rp)
+
+ decl vn
+ jz L(done)
+
+L(ol0): lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ lea 4(rp), rp
+ mov rp, 20(%esp)
+ mov 28(%esp), un
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ shr $2, un C FIXME: move out
+ movd 4(up), %mm1
+ lea -8(up), up
+ lea -8(rp), rp
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ xor %edx, %edx C zero edx and CF
+ jmp L(a0)
+
+L(la0): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+L(a0): psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ dec un
+ movd 4(up), %mm1
+ jnz L(la0)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ decl vn
+ jnz L(ol0)
+ jmp L(done)
+
+C ================================================================
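+C un mod 4 == 1: mul_1 entry L(of1), loop L(lm1); addmul_1 outer loop L(ol1)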
+ ALIGN(16)
+L(lm1): movd -12(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -12(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of1): paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, (rp)
+ lea 16(up), up
+ ja L(lm1)
+
+ psrlq $32, %mm6
+ movd %mm6, 4(rp)
+
+ decl vn
+ jz L(done)
+
+L(ol1): lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ lea 4(rp), rp
+ mov rp, 20(%esp)
+ mov 28(%esp), un
+
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ shr $2, un C FIXME: move out
+
+ lea -4(rp), rp
+ movd %mm1, %ebx
+ movd 4(up), %mm0
+ lea -4(up), up
+ pmuludq %mm7, %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a1)
+
+L(la1): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+L(a1): psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ dec un
+ movd 4(up), %mm1
+ jnz L(la1)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ decl vn
+ jnz L(ol1)
+ jmp L(done)
+
+C ================================================================
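+C un mod 4 == 2: mul_1 entry L(of2), loop L(lm2); addmul_1 outer loop L(ol2)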
+ ALIGN(16)
+L(lm2): movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of2): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 4(rp)
+ lea 16(up), up
+ ja L(lm2)
+
+ psrlq $32, %mm6
+ movd %mm6, 8(rp)
+
+ decl vn
+ jz L(done)
+
+L(ol2): lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ lea 4(rp), rp
+ mov rp, 20(%esp)
+ mov 28(%esp), un
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ xor %edx, %edx
+ shr $2, un C FIXME: move out
+ movd 4(up), %mm1
+ test un, un C clear carry
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ jnz L(a2)
+ jmp L(wd2)
+
+L(la2): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+L(a2): psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ dec un
+ movd 4(up), %mm1
+ jnz L(la2)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+L(wd2): psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ decl vn
+ jnz L(ol2)
+C jmp L(done)
+
+C ================================================================
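+C Common exit: clear the MMX state and restore callee-saved registers.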
+L(done):
+ emms
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()