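Optimize by deferring the stores of each product so that they are interleaved
with the multiplies for the next limb. The code now really runs at
18 cycles/limb on EV5, as the comment claims.

It also runs well on EV6.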
author: tege <tege@gmplib.org> 2001-02-12 01:51:47 +0100
committer: tege <tege@gmplib.org> 2001-02-12 01:51:47 +0100
commit: 2879ac3a9a1427c232d3ef06fe979ba016ac1e99 (patch)
tree: 0ca6a801c777d7778fcf097580962b789cc08e39 /mpn/alpha
parent: 8a4b05e64fe3819b51da494eba2ac5d7440dc3d5 (diff)
download: gmp-2879ac3a9a1427c232d3ef06fe979ba016ac1e99.tar.gz
1 files changed, 7 insertions, 7 deletions
diff --git a/mpn/alpha/sqr_diagonal.asm b/mpn/alpha/sqr_diagonal.asm
index 5b4e79f60..19060f926 100644
--- a/mpn/alpha/sqr_diagonal.asm
+++ b/mpn/alpha/sqr_diagonal.asm
@@ -26,7 +26,7 @@ dnl  res_ptr	r16
 dnl  s1_ptr	r17
 dnl  size	r18
 
-dnl  This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and ??
+dnl  This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 4
 dnl  cycles/limb on EV6.
 
 ASM_START()
@@ -37,22 +37,22 @@ PROLOGUE(mpn_sqr_diagonal)
 	umulh	r2,r2,r4	C r4 = prod_high
 	blt	r18,$Lend1	C jump if size was == 1
 	ldq	r2,8(r17)	C r2 = s1_limb
-	stq	r3,0(r16)
-	stq	r4,8(r16)
 	beq	r18,$Lend2	C jump if size was == 2
 
 	ALIGN(8)
-$Loop:	mulq	r2,r2,r3	C r3 = prod_low
+$Loop:	stq	r3,0(r16)
+	mulq	r2,r2,r3	C r3 = prod_low
 	lda	r18,-1(r18)	C size--
+	stq	r4,8(r16)
 	umulh	r2,r2,r4	C r4 = cy_limb
 	ldq	r2,16(r17)	C r2 = s1_limb
 	lda	r17,8(r17)	C s1_ptr++
-	stq	r3,16(r16)
-	stq	r4,24(r16)
 	lda	r16,16(r16)	C res_ptr++
 	bne	r18,$Loop
 
-$Lend2:	mulq	r2,r2,r3	C r3 = prod_low
+$Lend2:	stq	r3,0(r16)
+	mulq	r2,r2,r3	C r3 = prod_low
+	stq	r4,8(r16)
 	umulh	r2,r2,r4	C r4 = cy_limb
 	stq	r3,16(r16)
 	stq	r4,24(r16)
author	tege <tege@gmplib.org>	2001-02-12 01:51:47 +0100
committer	tege <tege@gmplib.org>	2001-02-12 01:51:47 +0100
commit	2879ac3a9a1427c232d3ef06fe979ba016ac1e99 (patch)
tree	0ca6a801c777d7778fcf097580962b789cc08e39 /mpn/alpha
parent	8a4b05e64fe3819b51da494eba2ac5d7440dc3d5 (diff)
download	gmp-2879ac3a9a1427c232d3ef06fe979ba016ac1e99.tar.gz