author     Kevin Ryde <user42@zip.com.au>    2001-04-25 02:46:23 +0200
committer  Kevin Ryde <user42@zip.com.au>    2001-04-25 02:46:23 +0200
commit     f47189694ffbe459f0d2b3b689d77906b578146c (patch)
tree       c35f39df72f2c19147ccc41931e518ba559303e3
parent     3d361eec7ff17ad54f1bcf9a3734e13e74cce1b1 (diff)
download   gmp-f47189694ffbe459f0d2b3b689d77906b578146c.tar.gz
* mpn/x86/k6/mmx/dive_1.asm: New file.
-rw-r--r--   mpn/x86/k6/mmx/dive_1.asm   290
1 file changed, 290 insertions, 0 deletions
diff --git a/mpn/x86/k6/mmx/dive_1.asm b/mpn/x86/k6/mmx/dive_1.asm
new file mode 100644
index 000000000..bc8cd750d
--- /dev/null
+++ b/mpn/x86/k6/mmx/dive_1.asm
@@ -0,0 +1,290 @@
+dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.
+dnl
+dnl                  divisor
+dnl                 odd   even
+dnl        K6:     10.0   12.0  cycles/limb
+dnl        K6-2:   10.0   11.5
+
+dnl  Copyright 2000, 2001 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 2.1 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public
+dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
+dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl  Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C A simple divl is used for size==1.  This is about 10 cycles faster for an
+C odd divisor or 20 cycles for an even divisor.
+C
+C The loops are quite sensitive to code alignment, speeds should be
+C rechecked (odd and even divisor, pic and non-pic) if contemplating
+C changing anything.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,     8)
+defframe(PARAM_DST,     4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	PARAM_SRC, %eax
+	xorl	%edx, %edx
+
+	cmpl	$1, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%eax), %eax
+
+	divl	PARAM_DIVISOR
+
+	movl	PARAM_DST, %ecx
+	movl	%eax, (%ecx)
+
+	ret
+
+
+L(two_or_more):
+	movl	PARAM_DIVISOR, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+L(strip_twos):
+	shrl	%eax
+	incl	%edx			C will get shift+1
+
+	jnc	L(strip_twos)
+	pushl	%esi		FRAME_pushl()
+
+	leal	1(%eax,%eax), %esi	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	call	L(movl_eip_ebp)
+
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebp
+C
+	movl	modlimb_invert_table@GOT(%ebp), %ebp
+C
+Zdisp(	movzbl,	0,(%eax,%ebp), %eax)
+',`
+
+dnl non-PIC
+	movzbl	modlimb_invert_table(%eax), %eax	C inv 8 bits
+')
+	pushl	%edi		FRAME_pushl()
+
+	leal	(%eax,%eax), %ebp	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_DST, %edi
+
+	imull	%esi, %eax		C inv*inv*d
+
+	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
+	leal	(%ebp,%ebp), %eax	C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	movl	%esi, PARAM_DIVISOR	C d without twos
+	leal	(%ebx,%ecx,4), %ebx	C src end
+
+	imull	%esi, %ebp		C inv*inv*d
+
+	leal	(%edi,%ecx,4), %edi	C dst end
+	negl	%ecx			C -size
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	subl	$1, %edx		C shift amount, and clear carry
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	jnz	L(even)
+
+	movl	(%ebx,%ecx,4), %esi	C src low limb
+	jmp	L(odd_entry)
+
+
+	ALIGN(16)
+	nop	C code alignment
+L(odd_top):
+	C eax	scratch
+	C ebx	src end
+	C ecx	counter, limbs, negative
+	C edx	inverse
+	C esi	next limb, adjusted for carry
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	imull	%edx, %esi
+
+	movl	PARAM_DIVISOR, %eax
+	movl	%esi, -4(%edi,%ecx,4)
+
+	mull	%esi			C carry limb in edx
+
+	subl	%ebp, %edx		C apply carry bit
+	movl	(%ebx,%ecx,4), %esi
+
+L(odd_entry):
+	subl	%edx, %esi		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebp, %ebp		C 0 or -1
+
+	incl	%ecx
+	jnz	L(odd_top)
+
+
+	imull	%edx, %esi
+
+	movl	%esi, -4(%edi,%ecx,4)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+
+ifdef(`PIC',`
+L(movl_eip_ebp):
+	movl	(%esp), %ebp
+	ret
+
+	ALIGN(8)
+	nop	C code alignment, necessary for claimed speed
+	nop
+',`
+C non-PIC code alignment already ok at 0x9a
+')
+
+L(even):
+	C eax
+	C ebx	src end
+	C ecx	-size
+	C edx	twos
+	C esi
+	C edi	dst end
+	C ebp
+
+	xorl	%ebp, %ebp
+Zdisp(	movq,	0,(%ebx,%ecx,4), %mm0)	C src[0,1]
+
+	movd	%edx, %mm7
+	movl	VAR_INVERSE, %edx
+
+	addl	$2, %ecx
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, %esi
+	jz	L(even_two)		C if only two limbs
+
+
+C Out-of-order execution is good enough to hide the load/rshift/movd
+C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
+C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
+C been found.  Maybe the fact every second movq is unaligned costs the extra
+C 0.5.
+
+L(even_top):
+	C eax	scratch
+	C ebx	src end
+	C ecx	counter, limbs, negative
+	C edx	inverse
+	C esi	next limb, adjusted for carry
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+	C
+	C mm0	scratch, source limbs
+	C mm7	twos
+
+	imull	%edx, %esi
+
+	movl	%esi, -8(%edi,%ecx,4)
+	movl	PARAM_DIVISOR, %eax
+
+	mull	%esi			C carry limb in edx
+
+	movq	-4(%ebx,%ecx,4), %mm0
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, %esi
+	subl	%ebp, %edx		C apply carry bit
+
+	subl	%edx, %esi		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebp, %ebp		C 0 or -1
+
+	incl	%ecx
+	jnz	L(even_top)
+
+
+L(even_two):
+	movd	-4(%ebx), %mm0		C src high limb
+	psrlq	%mm7, %mm0
+
+	imull	%edx, %esi
+
+	movl	%esi, -8(%edi)
+	movl	PARAM_DIVISOR, %eax
+
+	mull	%esi			C carry limb in edx
+
+	movd	%mm0, %esi
+	subl	%ebp, %edx		C apply carry bit
+
+	movl	VAR_INVERSE, %eax
+	subl	%edx, %esi		C apply carry limb
+
+	imull	%eax, %esi
+
+	movl	%esi, -4(%edi)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	emms_or_femms
+
+	ret
+
+EPILOGUE()
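
For readers following the algorithm rather than the scheduling: the setup
between L(strip_twos) and the ASSERT computes inv with d*inv == 1 mod 2^32,
starting from an 8-bit seed out of modlimb_invert_table and then applying
two Newton steps of inv = 2*inv - inv*inv*d, each of which doubles the
number of correct low bits.  A minimal C sketch of the same iteration,
assuming 32-bit limbs; binv32 is an illustrative name, not GMP API, and it
seeds from d itself (correct to 3 bits) instead of the table, so it needs
four steps rather than two.

#include <stdint.h>

/* Multiplicative inverse of odd d modulo 2^32.  Each step is the same
   update the asm does with leal/imull/subl, and doubles the number of
   correct low bits.  d itself is correct to 3 bits, since d*d == 1 mod 8
   for any odd d.  */
static uint32_t
binv32 (uint32_t d)
{
  uint32_t inv = d;             /* 3 bits */
  inv = 2*inv - d*inv*inv;      /* 6 bits */
  inv = 2*inv - d*inv*inv;      /* 12 bits */
  inv = 2*inv - d*inv*inv;      /* 24 bits */
  inv = 2*inv - d*inv*inv;      /* 48 bits, >= 32 */
  return inv;
}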
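
The loops then run the usual exact-division recurrence: subtract the
incoming carry from the next source limb (the subl/sbbl pair), multiply by
the inverse to get a quotient limb (imull), and take the high half of
quotient times divisor as the next carry (mull).  An even divisor has its
twos stripped first, with the source limbs shifted right by the same amount
on the fly (the psrlq work in L(even_top)).  A sketch of both cases
combined, again assuming 32-bit limbs; divexact_1_sketch is a hypothetical
name, not mpn_divexact_1 itself.

#include <stddef.h>

/* dst[] = src[]/divisor for n >= 1 limbs, where src is known to be an
   exact multiple of divisor.  Uses binv32 from the sketch above.  */
static void
divexact_1_sketch (uint32_t *dst, const uint32_t *src,
                   size_t n, uint32_t divisor)
{
  /* Strip twos from an even divisor; shifting the source right by the
     same amount leaves the quotient unchanged.  */
  int twos = 0;
  while ((divisor & 1) == 0)
    {
      divisor >>= 1;
      twos++;
    }

  uint32_t inv = binv32 (divisor);
  uint32_t k = 0;                       /* carry limb plus carry bit */
  for (size_t i = 0; i < n; i++)
    {
      uint32_t next = (i + 1 < n ? src[i+1] : 0);
      /* source limb, shifted right across the limb boundary */
      uint32_t s = (uint32_t) ((((uint64_t) next << 32) | src[i]) >> twos);
      uint32_t t = s - k;               /* apply carry */
      uint32_t q = t * inv;             /* exact quotient limb */
      dst[i] = q;
      /* next carry: high half of q*divisor, plus the borrow bit */
      k = (uint32_t) (((uint64_t) q * divisor) >> 32) + (t > s);
    }
}

For instance, on src = {0xFFFFFFFD, 2} (the two-limb value 3*0xFFFFFFFF)
with divisor 3, the first iteration gives q = 0xFFFFFFFF and carry k = 2,
and the second gives 0, i.e. the quotient 0xFFFFFFFF.  The size==1 divl
path in the asm exists because this inverse setup costs more than a single
plain division; per the header comments, divl is about 10 cycles faster
there for an odd divisor and 20 for an even one.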