summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKevin Ryde <user42@zip.com.au>2001-04-25 02:46:23 +0200
committerKevin Ryde <user42@zip.com.au>2001-04-25 02:46:23 +0200
commitf47189694ffbe459f0d2b3b689d77906b578146c (patch)
treec35f39df72f2c19147ccc41931e518ba559303e3
parent3d361eec7ff17ad54f1bcf9a3734e13e74cce1b1 (diff)
downloadgmp-f47189694ffbe459f0d2b3b689d77906b578146c.tar.gz
* mpn/x86/k6/mmx/dive_1.asm: New file.
-rw-r--r--mpn/x86/k6/mmx/dive_1.asm290
1 files changed, 290 insertions, 0 deletions
diff --git a/mpn/x86/k6/mmx/dive_1.asm b/mpn/x86/k6/mmx/dive_1.asm
new file mode 100644
index 000000000..bc8cd750d
--- /dev/null
+++ b/mpn/x86/k6/mmx/dive_1.asm
@@ -0,0 +1,290 @@
+dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.
+dnl
+dnl divisor
+dnl odd even
+dnl K6: 10.0 12.0 cycles/limb
+dnl K6-2: 10.0 11.5
+
+dnl Copyright 2000, 2001 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C A simple divl is used for size==1. This is about 10 cycles faster for an
+C odd divisor or 20 cycles for an even divisor.
+C
+C The loops are quite sensitive to code alignment, speeds should be
+C rechecked (odd and even divisor, pic and non-pic) if contemplating
+C changing anything.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+ TEXT
+
+ ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+
+ movl PARAM_SRC, %eax
+ xorl %edx, %edx
+
+ cmpl $1, %ecx
+ jnz L(two_or_more)
+
+ movl (%eax), %eax
+
+ divl PARAM_DIVISOR
+
+ movl PARAM_DST, %ecx
+ movl %eax, (%ecx)
+
+ ret
+
+
+L(two_or_more):
+ movl PARAM_DIVISOR, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ pushl %ebp FRAME_pushl()
+
+L(strip_twos):
+ shrl %eax
+ incl %edx C will get shift+1
+
+ jnc L(strip_twos)
+ pushl %esi FRAME_pushl()
+
+ leal 1(%eax,%eax), %esi C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ call L(movl_eip_ebp)
+
+ addl $_GLOBAL_OFFSET_TABLE_, %ebp
+ C
+ movl modlimb_invert_table@GOT(%ebp), %ebp
+ C
+Zdisp( movzbl, 0,(%eax,%ebp), %eax)
+',`
+
+dnl non-PIC
+ movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
+')
+ pushl %edi FRAME_pushl()
+
+ leal (%eax,%eax), %ebp C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ movl PARAM_DST, %edi
+
+ imull %esi, %eax C inv*inv*d
+
+ subl %eax, %ebp C inv = 2*inv - inv*inv*d
+ leal (%ebp,%ebp), %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ movl %esi, PARAM_DIVISOR C d without twos
+ leal (%ebx,%ecx,4), %ebx C src end
+
+ imull %esi, %ebp C inv*inv*d
+
+ leal (%edi,%ecx,4), %edi C dst end
+ negl %ecx C -size
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ subl $1, %edx C shift amount, and clear carry
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ movl %eax, VAR_INVERSE
+ jnz L(even)
+
+ movl (%ebx,%ecx,4), %esi C src low limb
+ jmp L(odd_entry)
+
+
+ ALIGN(16)
+ nop C code alignment
+L(odd_top):
+ C eax scratch
+ C ebx src end
+ C ecx counter, limbs, negative
+ C edx inverse
+ C esi next limb, adjusted for carry
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ imull %edx, %esi
+
+ movl PARAM_DIVISOR, %eax
+ movl %esi, -4(%edi,%ecx,4)
+
+ mull %esi C carry limb in edx
+
+ subl %ebp, %edx C apply carry bit
+ movl (%ebx,%ecx,4), %esi
+
+L(odd_entry):
+ subl %edx, %esi C apply carry limb
+ movl VAR_INVERSE, %edx
+
+ sbbl %ebp, %ebp C 0 or -1
+
+ incl %ecx
+ jnz L(odd_top)
+
+
+ imull %edx, %esi
+
+ movl %esi, -4(%edi,%ecx,4)
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ ret
+
+
+ifdef(`PIC',`
+L(movl_eip_ebp):
+ movl (%esp), %ebp
+ ret
+
+ ALIGN(8)
+ nop C code alignment, necessary for claimed speed
+ nop
+',`
+C non-PIC code alignment already ok at 0x9a
+')
+
+L(even):
+ C eax
+ C ebx src end
+ C ecx -size
+ C edx twos
+ C esi
+ C edi dst end
+ C ebp
+
+ xorl %ebp, %ebp
+Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1]
+
+ movd %edx, %mm7
+ movl VAR_INVERSE, %edx
+
+ addl $2, %ecx
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi
+ jz L(even_two) C if only two limbs
+
+
+C Out-of-order execution is good enough to hide the load/rshift/movd
+C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12,
+C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has
+C been found. Maybe the fact every second movq is unaligned costs the extra
+C 0.5.
+
+L(even_top):
+ C eax scratch
+ C ebx src end
+ C ecx counter, limbs, negative
+ C edx inverse
+ C esi next limb, adjusted for carry
+ C edi dst end
+ C ebp carry bit, 0 or -1
+ C
+ C mm0 scratch, source limbs
+ C mm7 twos
+
+ imull %edx, %esi
+
+ movl %esi, -8(%edi,%ecx,4)
+ movl PARAM_DIVISOR, %eax
+
+ mull %esi C carry limb in edx
+
+ movq -4(%ebx,%ecx,4), %mm0
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi
+ subl %ebp, %edx C apply carry bit
+
+ subl %edx, %esi C apply carry limb
+ movl VAR_INVERSE, %edx
+
+ sbbl %ebp, %ebp C 0 or -1
+
+ incl %ecx
+ jnz L(even_top)
+
+
+L(even_two):
+ movd -4(%ebx), %mm0 C src high limb
+ psrlq %mm7, %mm0
+
+ imull %edx, %esi
+
+ movl %esi, -8(%edi)
+ movl PARAM_DIVISOR, %eax
+
+ mull %esi C carry limb in edx
+
+ movd %mm0, %esi
+ subl %ebp, %edx C apply carry bit
+
+ movl VAR_INVERSE, %eax
+ subl %edx, %esi C apply carry limb
+
+ imull %eax, %esi
+
+ movl %esi, -4(%edi)
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ emms_or_femms
+
+ ret
+
+EPILOGUE()