author     tege <tege@gmplib.org>   2007-02-05 00:23:40 +0100
committer  tege <tege@gmplib.org>   2007-02-05 00:23:40 +0100
commit     38a7b2792b3f6a27761afdd3b64f1a44f5a59d35 (patch)
tree       ca364c22b663b208a214fb32f678e5131b826f2b /mpn
parent     8e5c94d42a79c18b73500d3223612e97db3bcf32 (diff)
download   gmp-38a7b2792b3f6a27761afdd3b64f1a44f5a59d35.tar.gz
Complete rewrite.
Diffstat (limited to 'mpn')
-rw-r--r--  mpn/x86/pentium4/sse2/addmul_1.asm | 275
1 file changed, 162 insertions(+), 113 deletions(-)
diff --git a/mpn/x86/pentium4/sse2/addmul_1.asm b/mpn/x86/pentium4/sse2/addmul_1.asm
index a8c554874..46da9e12d 100644
--- a/mpn/x86/pentium4/sse2/addmul_1.asm
+++ b/mpn/x86/pentium4/sse2/addmul_1.asm
@@ -1,135 +1,184 @@
-dnl Intel Pentium-4 mpn_addmul_1 -- Multiply a limb vector with a limb and add
-dnl the result to a second limb vector.
+dnl mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
-dnl Copyright 2001, 2002, 2004, 2005 Free Software Foundation, Inc.
+dnl Copyright 2005, 2007 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
-dnl Fifth Floor, Boston, MA 02110-1301, USA.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
include(`../config.m4')
+C TODO:
+C  * Tweak eax/edx offsets in the loop so as to save some lea's
+C  * Perhaps software-pipeline the small-case code
-C P3 model 9 (Banias) ?.?
-C P3 model 13 (Dothan) 5.8
-C P4 model 0 (Willamette) 5.5
-C P4 model 1 (?) 5.5
-C P4 model 2 (Northwood) 5.5
-C P4 model 3 (Prescott) 6.0
-C P4 model 4 (Nocona)
+C                            cycles/limb
+C P6 model 0-8,10-12              -
+C P6 model 9   (Banias)           ?
+C P6 model 13  (Dothan)           5.24
+C P4 model 0-1 (Willamette)       5
+C P4 model 2   (Northwood)        5
+C P4 model 3-4 (Prescott)         5
-C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t multiplier);
-C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t multiplier, mp_limb_t carry);
-C
-C Only the carry limb propagation is on the dependent chain, but some other
-C Pentium4 pipeline magic brings down performance to 6 cycles/l from the
-C ideal 4 cycles/l.
-
-defframe(PARAM_CARRY, 20)
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
+C INPUT PARAMETERS
+C rp sp + 4
+C up sp + 8
+C n sp + 12
+C v0 sp + 16
TEXT
ALIGN(16)
PROLOGUE(mpn_addmul_1c)
-deflit(`FRAME',0)
- movd PARAM_CARRY, %mm4
- jmp L(start_1c)
+ mov 4(%esp), %edx
+ mov 8(%esp), %eax
+ mov 12(%esp), %ecx
+ movd 16(%esp), %mm7
+ movd 20(%esp), %mm6
+ jmp .Lent
EPILOGUE()
-
+ ALIGN(16)
PROLOGUE(mpn_addmul_1)
-deflit(`FRAME',0)
- pxor %mm4, %mm4
-L(start_1c):
- movl PARAM_SRC, %eax
- movl PARAM_SIZE, %ecx
- movl PARAM_DST, %edx
- movd PARAM_MULTIPLIER, %mm7
-
- C eax src, incrementing
- C ecx loop counter, decrementing
- C edx dst, incrementing
- C
- C mm4 carry, low 32-bits
- C mm7 multiplier
-
- movd (%eax), %mm2 C ul = up[i]
- pmuludq %mm7, %mm2
-
- shrl $1, %ecx
- jnc L(even)
-
- leal 4(%eax), %eax
- movd (%edx), %mm1
- paddq %mm2, %mm1
- paddq %mm1, %mm4
- movd %mm4, (%edx)
- psrlq $32, %mm4
-
- testl %ecx, %ecx
- jz L(rtn)
- leal 4(%edx), %edx
-
- movd (%eax), %mm2 C ul = up[i]
- pmuludq %mm7, %mm2
-L(even):
- movd 4(%eax), %mm0 C ul = up[i]
- movd (%edx), %mm1 C rl = rp[0]
- pmuludq %mm7, %mm0
-
- subl $1, %ecx
- jz L(end)
-L(loop):
- paddq %mm2, %mm1 C rl += prod
- movd 8(%eax), %mm2 C ul = up[i]
- paddq %mm1, %mm4 C mm4 = prod + cy
- movd 4(%edx), %mm3 C rl = rp[0]
- pmuludq %mm7, %mm2
- movd %mm4, (%edx)
- psrlq $32, %mm4
+ mov 4(%esp), %edx
+ mov 8(%esp), %eax
+ mov 12(%esp), %ecx
+ movd 16(%esp), %mm7
+ pxor %mm6, %mm6
+.Lent: cmp $4, %ecx
+ jnc .Large
+
+.Llp0: movd (%eax), %mm0
+ lea 4(%eax), %eax
+ movd (%edx), %mm4
+ lea 4(%edx), %edx
+ pmuludq %mm7, %mm0
+ paddq %mm0, %mm4
+ paddq %mm4, %mm6
+ movd %mm6, -4(%edx)
+ psrlq $32, %mm6
+ dec %ecx
+ jnz .Llp0
+ movd %mm6, %eax
+ emms
+ ret
- paddq %mm0, %mm3 C rl += prod
- movd 12(%eax), %mm0 C ul = up[i]
- paddq %mm3, %mm4 C mm4 = prod + cy
- movd 8(%edx), %mm1 C rl = rp[0]
- pmuludq %mm7, %mm0
- movd %mm4, 4(%edx)
- psrlq $32, %mm4
+.Large: and $3, %ecx
+ je .L0
+ cmp $2, %ecx
+ jc .L1
+ je .L2
+ jmp .L3 C FIXME: one case should fall through
+
+.L0: movd (%eax), %mm3
+ sub 12(%esp), %ecx C loop count
+ lea -16(%eax), %eax
+ lea -12(%edx), %edx
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ movd 12(%edx), %mm5
+ pmuludq %mm7, %mm0
+ movd 24(%eax), %mm1
+ paddq %mm3, %mm5
+ movd 16(%edx), %mm4
+ jmp .L00
+
+.L1: movd (%eax), %mm2
+ sub 12(%esp), %ecx
+ lea -12(%eax), %eax
+ lea -8(%edx), %edx
+ movd 8(%edx), %mm4
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ paddq %mm2, %mm4
+ movd 12(%edx), %mm5
+ jmp .L01
+
+.L2: movd (%eax), %mm1
+ sub 12(%esp), %ecx
+ lea -8(%eax), %eax
+ lea -4(%edx), %edx
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+ movd 4(%edx), %mm5
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ paddq %mm1, %mm5
+ movd 8(%edx), %mm4
+ jmp .L10
+
+.L3: movd (%eax), %mm0
+ sub 12(%esp), %ecx
+ lea -4(%eax), %eax
+ pmuludq %mm7, %mm0
+ movd 8(%eax), %mm1
+ movd (%edx), %mm4
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+ paddq %mm0, %mm4
+ movd 4(%edx), %mm5
- leal 8(%eax), %eax
- leal 8(%edx), %edx
- subl $1, %ecx
- jnz L(loop)
-L(end):
- paddq %mm2, %mm1 C rl += prod
- paddq %mm1, %mm4 C mm4 = prod + cy
- movd 4(%edx), %mm3 C rl = rp[0]
- movd %mm4, (%edx)
- psrlq $32, %mm4
- paddq %mm0, %mm3 C rl += prod
- paddq %mm3, %mm4 C mm4 = prod + cy
- movd %mm4, 4(%edx)
- psrlq $32, %mm4
-L(rtn):
- movd %mm4, %eax
+ ALIGN(16)
+.Loop: pmuludq %mm7, %mm2
+ paddq %mm4, %mm6
+ movd 16(%eax), %mm3
+ paddq %mm1, %mm5
+ movd 8(%edx), %mm4
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+.L10: pmuludq %mm7, %mm3
+ paddq %mm5, %mm6
+ movd 20(%eax), %mm0
+ paddq %mm2, %mm4
+ movd 12(%edx), %mm5
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+.L01: pmuludq %mm7, %mm0
+ paddq %mm4, %mm6
+ movd 24(%eax), %mm1
+ paddq %mm3, %mm5
+ movd 16(%edx), %mm4
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+.L00: pmuludq %mm7, %mm1
+ paddq %mm5, %mm6
+ movd 28(%eax), %mm2
+ paddq %mm0, %mm4
+ movd 20(%edx), %mm5
+ movd %mm6, 12(%edx)
+ psrlq $32, %mm6
+ lea 16(%eax), %eax
+ lea 16(%edx), %edx
+ add $4, %ecx
+ jnz .Loop
+
+.Lend: pmuludq %mm7, %mm2
+ paddq %mm4, %mm6
+ paddq %mm1, %mm5
+ movd 8(%edx), %mm4
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+ paddq %mm5, %mm6
+ paddq %mm2, %mm4
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+ paddq %mm4, %mm6
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+ movd %mm6, %eax
emms
ret
-
EPILOGUE()
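
What the routine computes: mpn_addmul_1(rp, up, n, v0) multiplies each of the n limbs of up[] by the single limb v0, adds the products into rp[], and returns the final carry limb; mpn_addmul_1c additionally takes an incoming carry as a fifth argument (at sp + 20, loaded by the movd 20(%esp), %mm6 above). The C below is only a reference sketch of that semantics, assuming 32-bit limbs as on x86; the names ref_addmul_1c and ref_addmul_1 are invented for illustration and are not GMP's implementation. The 64-bit temporary mirrors the asm's carry handling, where pmuludq/paddq accumulate a partial sum in %mm6 and psrlq $32 extracts the next carry.

#include <stdint.h>

/* Reference model only, not the GMP code: rp[] += up[] * v0, returning
   the carry limb.  Assumes 32-bit limbs.  */
uint32_t
ref_addmul_1c (uint32_t *rp, const uint32_t *up, long n,
               uint32_t v0, uint32_t carry)
{
  for (long i = 0; i < n; i++)
    {
      /* 64-bit partial sum: product + old rp limb + incoming carry,
         analogous to the pmuludq/paddq pair in the asm.  */
      uint64_t t = (uint64_t) up[i] * v0 + rp[i] + carry;
      rp[i] = (uint32_t) t;          /* low half:  movd %mm6, (%edx) */
      carry = (uint32_t) (t >> 32);  /* high half: psrlq $32, %mm6   */
    }
  return carry;
}

uint32_t
ref_addmul_1 (uint32_t *rp, const uint32_t *up, long n, uint32_t v0)
{
  return ref_addmul_1c (rp, up, n, v0, 0);
}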
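
The bulk of the new code is the .Large path: n is split into n mod 4 feed-in limbs (entry points .L0 through .L3) plus a 4-way unrolled, software-pipelined main loop (.Loop), with .Lend draining the pipeline. A rough C model of that control flow, reusing the hypothetical ref_addmul_1c from the sketch above, might look like this:

#include <stdint.h>

/* Declared in the previous sketch.  */
uint32_t ref_addmul_1c (uint32_t *rp, const uint32_t *up, long n,
                        uint32_t v0, uint32_t carry);

/* Hypothetical model of the .Large control flow: peel n mod 4 limbs
   (the .L1/.L2/.L3 feed-in), then process four limbs per iteration
   (the .Loop).  The asm enters the pipelined loop at one of four
   stages instead of running a separate peel loop, but the arithmetic
   is the same.  */
uint32_t
model_addmul_1c_large (uint32_t *rp, const uint32_t *up, long n,
                       uint32_t v0, uint32_t cy)
{
  long r = n & 3;                      /* and $3, %ecx */
  cy = ref_addmul_1c (rp, up, r, v0, cy);
  for (long i = r; i < n; i += 4)      /* four limbs per iteration */
    cy = ref_addmul_1c (rp + i, up + i, 4, v0, cy);
  return cy;
}

The four entry points exist so the asm never runs a separate peel loop: each case preloads the right number of products and destination limbs and then continues at the matching stage (.L00, .L01, .L10, or the top of .Loop), keeping the carry chain in %mm6 unbroken.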