Rewrite, smaller and slightly faster.

author: Kevin Ryde <user42@zip.com.au> 2000-03-12 22:44:25 +0100
committer: Kevin Ryde <user42@zip.com.au> 2000-03-12 22:44:25 +0100
commit: 539a50edce44305c5439f9a79f03ca64baf6386b (patch)
tree: 7e8ee426418aa70e6ab1ce87be1b2509ba34d5a3 /mpn
parent: 0e21ad2d8c2a1b6e3fe35ed9706a559d83ed2549 (diff)
download: gmp-539a50edce44305c5439f9a79f03ca64baf6386b.tar.gz
1 files changed, 131 insertions, 91 deletions
diff --git a/mpn/x86/k6/mul_1.asm b/mpn/x86/k6/mul_1.asm
index 6321c013f..beadf5832 100644
--- a/mpn/x86/k6/mul_1.asm
+++ b/mpn/x86/k6/mul_1.asm
@@ -1,7 +1,6 @@
 # AMD K6 mpn_mul_1 -- mpn by limb multiply.
 #
-# K6: 6.4 cycles/limb in the loop (at 16 limbs/loop), PIC adds 4 cycles at
-# the start of the unrolled loop.
+# K6: 6.25 cycles/limb.
 
 
 # Copyright (C) 1999-2000 Free Software Foundation, Inc.
@@ -26,24 +25,6 @@
 include(`../config.m4')
 
 
-dnl  Unrolling to 8 pushes the code size for the loop just just over a 32
-dnl  byte boundary, so perhaps extra code fetching explains the poorer speed
-dnl  on this unrolling.  Unrolling to 16 gets the loop code nicely in a 256
-dnl  byte block, which might explain why it's as good as 32 or 64.
-dnl
-dnl  K6 UNROLL_COUNT cycles/limb
-dnl           4          6.8
-dnl           8          7.5
-dnl          16          6.4
-dnl          32          6.4
-dnl          64          6.4
-dnl
-dnl  Cf. simple loop is 8.0.
-dnl  Maximum unrolling possible with the current code is 64.
-
-deflit(UNROLL_COUNT, 16)
-
-
 # mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
 #                      mp_limb_t multiplier);
 # mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
@@ -54,18 +35,6 @@ deflit(UNROLL_COUNT, 16)
 #
 # mpn_mul_1c() accepts an initial carry for the calculation, it's added into
 # the low limb of the result.
-#
-# In the unrolled loop it's 6 cycles/limb, with instruction decoding being
-# the limiting factor.
-#
-# The jacdl0() technique used in mpn_add/submul_1 doesn't help in the
-# unrolled loop here, using it comes out a touch slower than adcl.
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 11)
-',`
-deflit(UNROLL_THRESHOLD, 9)
-')
 
 defframe(PARAM_CARRY,     20)
 defframe(PARAM_MULTIPLIER,16)
@@ -73,6 +42,9 @@ defframe(PARAM_SIZE,      12)
 defframe(PARAM_SRC,       8)
 defframe(PARAM_DST,       4)
 
+dnl  minimum 5: the unrolled code always does 4 limbs in its loop, then 1 to 4 more
+deflit(UNROLL_THRESHOLD, 5)
+
 	.text
 	ALIGN(32)
 
@@ -92,23 +64,23 @@ deflit(`FRAME',4)
 L(start_nc):
 	mov	PARAM_SIZE, %ecx
 	push	%ebx
-deflit(`FRAME',8)
+FRAME_pushl()
 
 	movl	PARAM_SRC, %ebx
 	push	%edi
-deflit(`FRAME',12)
+FRAME_pushl()
 
 	movl	PARAM_DST, %edi
 	pushl	%ebp
-deflit(`FRAME',16)
+FRAME_pushl()
 
+	cmpl	$UNROLL_THRESHOLD, %ecx
 	movl	PARAM_MULTIPLIER, %ebp
-	cmp	$UNROLL_THRESHOLD, %ecx
 
 	jae	L(unroll)
 
 
-	# this is offset 0x22, which is close enough to aligned
+	# code offset 0x22 here, close enough to aligned
 L(simple):
 	# eax	scratch
 	# ebx	src
@@ -117,6 +89,8 @@ L(simple):
 	# esi	carry
 	# edi	dst
 	# ebp	multiplier
+	#
+	# this loop runs at 8 cycles/limb
 
 	movl	(%ebx), %eax
 	addl	$4, %ebx
@@ -145,87 +119,153 @@ L(simple):
 	ret
 
 
-#-----------------------------------------------------------------------------
-	ALIGN(16)
+#------------------------------------------------------------------------------
+# The code for each limb is 6 cycles, with instruction decoding being the
+# limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
+# cycles/limb in total.
+#
+# The secret ingredient to get 6.25 is to start the loop with the mul and
+# have the load/store pair at the end.  Rotating the load/store to the top
+# is a 0.5 c/l slowdown.  (Don't quite know why.)
+#
+# The whole unrolled loop fits nicely in exactly 80 bytes.
+
+
+	ALIGN(16)	# already aligned to 16 here actually
 L(unroll):
-	movl	%ecx, %eax
+	movl	(%ebx), %eax
+	leal	-16(%ebx,%ecx,4), %ebx
+
+	leal	-16(%edi,%ecx,4), %edi
+	subl	$4, %ecx
+
 	negl	%ecx
 
-	andl	$UNROLL_MASK, %ecx
-	movl	$0, %edx
 
-	subl	%ecx, %edx
-	shll	$4, %ecx
+	ALIGN(16)	# one byte nop for this alignment
+L(top):
+	# eax	scratch
+	# ebx	&src[size-4]
+	# ecx	counter
+	# edx	scratch
+	# esi	carry
+	# edi	&dst[size-4]
+	# ebp	multiplier
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
 
-	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edx,4), %ebx
-	decl	%eax
+	adcl	%edx, %esi
 
-	shrl	$UNROLL_LOG2, %eax
-	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%edx,4), %edi
+	movl	%eax, (%edi,%ecx,4)
+	movl	4(%ebx,%ecx,4), %eax
 
-	# 15 code bytes/limb
-ifdef(`PIC',`
-	call	L(pic_calc)
-L(here):
-',`
-	leal	L(entry) (%edx,%ecx), %edx
-')
-	mov	%eax, %ecx
-	
-	jmp	*%edx
 
+	mull	%ebp
 
-ifdef(`PIC',`
-L(pic_calc):
-	# See README.family about old gas bugs
-	leal	(%edx,%ecx), %edx
-	addl	$L(entry)-L(here), %edx
-	addl	(%esp), %edx
-	ret
-')
+	addl	%esi, %eax
+	movl	$0, %esi
 
+	adcl	%edx, %esi
 
-#----------------------------------------------------------------------------
-# need 32 byte alignment here to get the claimed speed
-	ALIGN(32)
-L(top):
-	# eax	scratch
-	# ebx	src
-	# ecx	loop counter
-	# edx	scratch
+	movl	%eax, 4(%edi,%ecx,4)
+	movl	8(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 8(%edi,%ecx,4)
+	movl	12(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 12(%edi,%ecx,4)
+	movl	16(%ebx,%ecx,4), %eax
+
+
+	addl	$4, %ecx
+	js	L(top)
+
+
+
+	# eax	next src limb
+	# ebx	&src[size-4]
+	# ecx	0 to 3 representing respectively 4 to 1 further limbs
+	# edx	scratch
 	# esi	carry
-	# edi	dst
-	# ebp	multiplier
-	#
-	# 15 code bytes/limb
+	# edi	&dst[size-4]
+
+	testb	$2, %cl
+	jnz	L(finish_not_two)
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
 
-	leal	UNROLL_BYTES(%edi), %edi
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi,%ecx,4)
+	movl	4(%ebx,%ecx,4), %eax
 
-L(entry):
-forloop(i, 0, UNROLL_COUNT-1, `
-	deflit(`disp', eval(i*4 ifelse(UNROLL_BYTES,256,-128)))
 
-Zdisp(	movl,	disp,(%ebx), %eax)
 	mull	%ebp
+
 	addl	%esi, %eax
-	movl	%edx, %esi
-	adcl	$0, %esi
-Zdisp(	movl,	%eax, disp,(%edi))
-')
+	movl	$0, %esi
+
+	adcl	%edx, %esi
 
-	decl	%ecx
-	leal	UNROLL_BYTES(%ebx), %ebx
+	movl	%eax, 4(%edi,%ecx,4)
+	movl	8(%ebx,%ecx,4), %eax
 
-	jns	L(top)
+	addl	$2, %ecx
+L(finish_not_two):
 
 
+	testb	$1, %cl
+	jnz	L(finish_not_one)
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 8(%edi)
+	movl	12(%ebx), %eax
+L(finish_not_one):
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
 	popl	%ebp
+
+	adcl	$0, %edx
+
+	movl	%eax, 12(%edi)
 	popl	%edi
 
 	popl	%ebx
-	movl	%esi, %eax
+	movl	%edx, %eax
 
 	popl	%esi
+
 	ret
 
 EPILOGUE()
author	Kevin Ryde <user42@zip.com.au>	2000-03-12 22:44:25 +0100
committer	Kevin Ryde <user42@zip.com.au>	2000-03-12 22:44:25 +0100
commit	539a50edce44305c5439f9a79f03ca64baf6386b (patch)
tree	7e8ee426418aa70e6ab1ce87be1b2509ba34d5a3 /mpn
parent	0e21ad2d8c2a1b6e3fe35ed9706a559d83ed2549 (diff)
download	gmp-539a50edce44305c5439f9a79f03ca64baf6386b.tar.gz