summary | refs | log | tree | commit | diff
path: root/mpn/powerpc64
diff options
context:
space:
mode:
authorTorbjorn Granlund <tege@gmplib.org>2010-04-21 21:02:49 +0200
committerTorbjorn Granlund <tege@gmplib.org>2010-04-21 21:02:49 +0200
commiteb76cf0171a29adbb785aaf4987c2a36c7a8bf7c (patch)
tree3a200cae605a9c3f74a74cc04c0a0b8f62680270 /mpn/powerpc64
parentef6a7533b04cb10e7f96e3c24bc33a4d9cbce943 (diff)
downloadgmp-eb76cf0171a29adbb785aaf4987c2a36c7a8bf7c.tar.gz
Swap multiply insns to make them consecutive, for the benefit of POWER6.
Diffstat (limited to 'mpn/powerpc64')
-rw-r--r--  mpn/powerpc64/mode64/aorsmul_1.asm | 23
-rw-r--r--  mpn/powerpc64/mode64/dive_1.asm    | 28
-rw-r--r--  mpn/powerpc64/mode64/divrem_2.asm  | 12
-rw-r--r--  mpn/powerpc64/mode64/mul_1.asm     | 24
4 files changed, 38 insertions, 49 deletions
diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm
index 939b2fe4d..f7ac9f002 100644
--- a/mpn/powerpc64/mode64/aorsmul_1.asm
+++ b/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -22,9 +22,10 @@ include(`../config.m4')
C mpn_addmul_1 mpn_submul_1
C cycles/limb cycles/limb
-C POWER3/PPC630: 6-18 6-18
-C POWER4/PPC970: 8 8.3
-C POWER5: 8 ?
+C POWER3/PPC630 6-18 6-18
+C POWER4/PPC970 8? 8.3? not updated for last file revision
+C POWER5 8 8.75
+C POWER6 16 16.5
C TODO
C * Try to reduce the number of needed live registers
@@ -118,10 +119,10 @@ L(gt1): ld r9, 0(up)
ld r27, 8(up)
mulld r0, r9, r6
mulhdu r5, r9, r6
- ld r9, 16(up)
- ld r28, 0(rp)
mulld r7, r27, r6
mulhdu r8, r27, r6
+ ld r9, 16(up)
+ ld r28, 0(rp)
ld r29, 8(rp)
ld r30, 16(rp)
mulld r11, r9, r6
@@ -151,20 +152,20 @@ L(b10): addic r0, r0, 0
ALIGN(16)
L(top): mulld r0, r9, r6
mulhdu r5, r9, r6 C 9
- ld r9, 0(up)
- ld r28, 0(rp)
mulld r7, r27, r6
mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
ld r27, 8(up)
ld r29, 8(rp)
adde r0, r0, r12 C 0 12
adde r7, r7, r5 C 5 7
mulld r5, r9, r6
mulhdu r10, r9, r6 C 9
- ld r9, 16(up)
- ld r30, 16(rp)
mulld r11, r27, r6
mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
ld r27, 24(up)
ld r31, 24(rp)
adde r5, r5, r8 C 8 5
@@ -185,12 +186,10 @@ L(bot): INVCY(r11)
L(end): mulld r0, r9, r6
mulhdu r5, r9, r6
- ld r28, 0(rp)
- nop
mulld r7, r27, r6
mulhdu r8, r27, r6
+ ld r28, 0(rp)
ld r29, 8(rp)
- nop
adde r0, r0, r12
adde r7, r7, r5
addze r8, r8
diff --git a/mpn/powerpc64/mode64/dive_1.asm b/mpn/powerpc64/mode64/dive_1.asm
index 1f482bae5..d457d65e9 100644
--- a/mpn/powerpc64/mode64/dive_1.asm
+++ b/mpn/powerpc64/mode64/dive_1.asm
@@ -1,6 +1,6 @@
dnl PowerPC-64 mpn_divexact_1 -- mpn by limb exact division.
-dnl Copyright 2006 Free Software Foundation, Inc.
+dnl Copyright 2006, 2010 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -19,14 +19,15 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 13-19
-C POWER4/PPC970: 16
-C POWER5: 16
+C cycles/limb
+C norm unorm
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16 16
+C POWER6 37 46
C TODO
C * Check if n=1 code is really an improvement. It probably isn't.
-C * Perhaps remove L(norm) code, it is currently unreachable.
C * Make more similar to mode1o.asm.
C INPUT PARAMETERS
@@ -61,7 +62,6 @@ L(7):
mtctr n
LEA( r5, binvert_limb_table)
rldicl r11, d, 63, 57
-C cmpdi cr7, r0, 0
lbzx r0, r5, r11
mulld r9, r0, r0
sldi r0, r0, 1
@@ -75,26 +75,27 @@ C cmpdi cr7, r0, 0
sldi r0, r0, 1
mulld r9, d, r9
subf r7, r9, r0 C r7 = 1/d mod 2^64
-C beq cr7, L(norm)
+ bne cr0, L(norm)
subfic r8, r10, 64 C set carry as side effect
li r5, 0
+ srd r11, r12, r10
ALIGN(16)
L(loop0):
- srd r11, r12, r10
ld r12, 8(up)
+ nop
addi up, up, 8
sld r0, r12, r8
or r11, r11, r0
subfe r9, r5, r11
+ srd r11, r12, r10
mulld r0, r7, r9
+ mulhdu r5, r0, d
std r0, 0(rp)
addi rp, rp, 8
- mulhdu r5, r0, d
bdnz L(loop0)
- srd r0, r12, r10
- subfe r0, r5, r0
+ subfe r0, r5, r11
mulld r0, r7, r0
std r0, 0(rp)
blr
@@ -102,14 +103,15 @@ L(loop0):
ALIGN(16)
L(norm):
mulld r11, r12, r7
+ mulhdu r5, r11, d
std r11, 0(rp)
ALIGN(16)
L(loop1):
- mulhdu r5, r11, d
ld r9, 8(up)
addi up, up, 8
subfe r5, r5, r9
mulld r11, r7, r5
+ mulhdu r5, r11, d C result not used
std r11, 8(rp)
addi rp, rp, 8
bdnz L(loop1)
diff --git a/mpn/powerpc64/mode64/divrem_2.asm b/mpn/powerpc64/mode64/divrem_2.asm
index 369b5c1f1..53ef1c708 100644
--- a/mpn/powerpc64/mode64/divrem_2.asm
+++ b/mpn/powerpc64/mode64/divrem_2.asm
@@ -22,11 +22,9 @@ include(`../config.m4')
C cycles/limb
C norm frac
C POWER3/PPC630
-C POWER4/PPC970 39* 39*
-C POWER5 39* 39*
-
-C STATUS
-C * Performace fluctuates like crazy
+C POWER4/PPC970 ? ?
+C POWER5 37 ?
+C POWER6 62 ?
C INPUT PARAMETERS
C qp = r3
@@ -121,12 +119,12 @@ L(loop):
mulld r6, r29, r3
addc r6, r6, r31
adde r8, r8, r29
+ cmpd cr7, r27, r25
mulld r0, r30, r8
- subf r31, r0, r31
mulhdu r11, r28, r8
mulld r10, r28, r8
+ subf r31, r0, r31
li r7, 0
- cmpd cr7, r27, r25
blt cr7, L(60)
ld r7, 0(r26)
addi r26, r26, -8
diff --git a/mpn/powerpc64/mode64/mul_1.asm b/mpn/powerpc64/mode64/mul_1.asm
index 8f644d871..12bff2fb6 100644
--- a/mpn/powerpc64/mode64/mul_1.asm
+++ b/mpn/powerpc64/mode64/mul_1.asm
@@ -1,7 +1,7 @@
dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl the result in a second limb vector.
-dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2010 Free Software
dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -22,9 +22,10 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C POWER3/PPC630: 6-18
-C POWER4/PPC970: 7.25
-C POWER5: 7.75
+C POWER3/PPC630 6-18
+C POWER4/PPC970 7.25? not updated for last file revision
+C POWER5 7.25
+C POWER6 14
C TODO
C * Try to reduce the number of needed live registers (at least r5 and r10
@@ -118,26 +119,18 @@ L(b10): ld r27, 8(up)
L(top): mulld r0, r26, r6
mulhdu r5, r26, r6
- ld r26, 0(up)
- nop
-
mulld r7, r27, r6
mulhdu r8, r27, r6
+ ld r26, 0(up)
ld r27, 8(up)
- nop
-
adde r0, r0, r12
adde r7, r7, r5
-
mulld r9, r26, r6
mulhdu r10, r26, r6
- ld r26, 16(up)
- nop
-
mulld r11, r27, r6
mulhdu r12, r27, r6
+ ld r26, 16(up)
ld r27, 24(up)
-
std r0, 0(rp)
adde r9, r9, r8
std r7, 8(rp)
@@ -151,13 +144,10 @@ L(top): mulld r0, r26, r6
L(end): mulld r0, r26, r6
mulhdu r5, r26, r6
-
mulld r7, r27, r6
mulhdu r8, r27, r6
-
adde r0, r0, r12
adde r7, r7, r5
-
std r0, 0(rp)
std r7, 8(rp)
L(ret): addze r3, r8