summary | refs | log | tree | commit | diff
path: root/mpn/powerpc64
diff options
context:
space:
mode:
authorTorbjorn Granlund <tege@gmplib.org>2010-04-21 21:02:49 +0200
committerTorbjorn Granlund <tege@gmplib.org>2010-04-21 21:02:49 +0200
commiteb76cf0171a29adbb785aaf4987c2a36c7a8bf7c (patch)
tree3a200cae605a9c3f74a74cc04c0a0b8f62680270 /mpn/powerpc64
parentef6a7533b04cb10e7f96e3c24bc33a4d9cbce943 (diff)
downloadgmp-eb76cf0171a29adbb785aaf4987c2a36c7a8bf7c.tar.gz
Swap multiply insns to make them consecutive, for the benefit of POWER6.
Diffstat (limited to 'mpn/powerpc64')
-rw-r--r--  mpn/powerpc64/mode64/aorsmul_1.asm | 23
-rw-r--r--  mpn/powerpc64/mode64/dive_1.asm    | 28
-rw-r--r--  mpn/powerpc64/mode64/divrem_2.asm  | 12
-rw-r--r--  mpn/powerpc64/mode64/mul_1.asm     | 24
4 files changed, 38 insertions, 49 deletions
diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm
index 939b2fe4d..f7ac9f002 100644
--- a/mpn/powerpc64/mode64/aorsmul_1.asm
+++ b/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -22,9 +22,10 @@ include(`../config.m4')
C mpn_addmul_1 mpn_submul_1
C cycles/limb cycles/limb
-C POWER3/PPC630: 6-18 6-18
-C POWER4/PPC970: 8 8.3
-C POWER5: 8 ?
+C POWER3/PPC630 6-18 6-18
+C POWER4/PPC970 8? 8.3? not updated for last file revision
+C POWER5 8 8.75
+C POWER6 16 16.5
C TODO
C * Try to reduce the number of needed live registers
@@ -118,10 +119,10 @@ L(gt1): ld r9, 0(up)
ld r27, 8(up)
mulld r0, r9, r6
mulhdu r5, r9, r6
- ld r9, 16(up)
- ld r28, 0(rp)
mulld r7, r27, r6
mulhdu r8, r27, r6
+ ld r9, 16(up)
+ ld r28, 0(rp)
ld r29, 8(rp)
ld r30, 16(rp)
mulld r11, r9, r6
@@ -151,20 +152,20 @@ L(b10): addic r0, r0, 0
ALIGN(16)
L(top): mulld r0, r9, r6
mulhdu r5, r9, r6 C 9
- ld r9, 0(up)
- ld r28, 0(rp)
mulld r7, r27, r6
mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
ld r27, 8(up)
ld r29, 8(rp)
adde r0, r0, r12 C 0 12
adde r7, r7, r5 C 5 7
mulld r5, r9, r6
mulhdu r10, r9, r6 C 9
- ld r9, 16(up)
- ld r30, 16(rp)
mulld r11, r27, r6
mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
ld r27, 24(up)
ld r31, 24(rp)
adde r5, r5, r8 C 8 5
@@ -185,12 +186,10 @@ L(bot): INVCY(r11)
L(end): mulld r0, r9, r6
mulhdu r5, r9, r6
- ld r28, 0(rp)
- nop
mulld r7, r27, r6
mulhdu r8, r27, r6
+ ld r28, 0(rp)
ld r29, 8(rp)
- nop
adde r0, r0, r12
adde r7, r7, r5
addze r8, r8
diff --git a/mpn/powerpc64/mode64/dive_1.asm b/mpn/powerpc64/mode64/dive_1.asm
index 1f482bae5..d457d65e9 100644
--- a/mpn/powerpc64/mode64/dive_1.asm
+++ b/mpn/powerpc64/mode64/dive_1.asm
@@ -1,6 +1,6 @@
dnl PowerPC-64 mpn_divexact_1 -- mpn by limb exact division.
-dnl Copyright 2006 Free Software Foundation, Inc.
+dnl Copyright 2006, 2010 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -19,14 +19,15 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: 13-19
-C POWER4/PPC970: 16
-C POWER5: 16
+C cycles/limb
+C norm unorm
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16 16
+C POWER6 37 46
C TODO
C * Check if n=1 code is really an improvement. It probably isn't.
-C * Perhaps remove L(norm) code, it is currently unreachable.
C * Make more similar to mode1o.asm.
C INPUT PARAMETERS
@@ -61,7 +62,6 @@ L(7):
mtctr n
LEA( r5, binvert_limb_table)
rldicl r11, d, 63, 57
-C cmpdi cr7, r0, 0
lbzx r0, r5, r11
mulld r9, r0, r0
sldi r0, r0, 1
@@ -75,26 +75,27 @@ C cmpdi cr7, r0, 0
sldi r0, r0, 1
mulld r9, d, r9
subf r7, r9, r0 C r7 = 1/d mod 2^64
-C beq cr7, L(norm)
+ bne cr0, L(norm)
subfic r8, r10, 64 C set carry as side effect
li r5, 0
+ srd r11, r12, r10
ALIGN(16)
L(loop0):
- srd r11, r12, r10
ld r12, 8(up)
+ nop
addi up, up, 8
sld r0, r12, r8
or r11, r11, r0
subfe r9, r5, r11
+ srd r11, r12, r10
mulld r0, r7, r9
+ mulhdu r5, r0, d
std r0, 0(rp)
addi rp, rp, 8
- mulhdu r5, r0, d
bdnz L(loop0)
- srd r0, r12, r10
- subfe r0, r5, r0
+ subfe r0, r5, r11
mulld r0, r7, r0
std r0, 0(rp)
blr
@@ -102,14 +103,15 @@ L(loop0):
ALIGN(16)
L(norm):
mulld r11, r12, r7
+ mulhdu r5, r11, d
std r11, 0(rp)
ALIGN(16)
L(loop1):
- mulhdu r5, r11, d
ld r9, 8(up)
addi up, up, 8
subfe r5, r5, r9
mulld r11, r7, r5
+ mulhdu r5, r11, d C result not used
std r11, 8(rp)
addi rp, rp, 8
bdnz L(loop1)
diff --git a/mpn/powerpc64/mode64/divrem_2.asm b/mpn/powerpc64/mode64/divrem_2.asm
index 369b5c1f1..53ef1c708 100644
--- a/mpn/powerpc64/mode64/divrem_2.asm
+++ b/mpn/powerpc64/mode64/divrem_2.asm
@@ -22,11 +22,9 @@ include(`../config.m4')
C cycles/limb
C norm frac
C POWER3/PPC630
-C POWER4/PPC970 39* 39*
-C POWER5 39* 39*
-
-C STATUS
-C * Performace fluctuates like crazy
+C POWER4/PPC970 ? ?
+C POWER5 37 ?
+C POWER6 62 ?
C INPUT PARAMETERS
C qp = r3
@@ -121,12 +119,12 @@ L(loop):
mulld r6, r29, r3
addc r6, r6, r31
adde r8, r8, r29
+ cmpd cr7, r27, r25
mulld r0, r30, r8
- subf r31, r0, r31
mulhdu r11, r28, r8
mulld r10, r28, r8
+ subf r31, r0, r31
li r7, 0
- cmpd cr7, r27, r25
blt cr7, L(60)
ld r7, 0(r26)
addi r26, r26, -8
diff --git a/mpn/powerpc64/mode64/mul_1.asm b/mpn/powerpc64/mode64/mul_1.asm
index 8f644d871..12bff2fb6 100644
--- a/mpn/powerpc64/mode64/mul_1.asm
+++ b/mpn/powerpc64/mode64/mul_1.asm
@@ -1,7 +1,7 @@
dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl the result in a second limb vector.
-dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2010 Free Software
dnl Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -22,9 +22,10 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C POWER3/PPC630: 6-18
-C POWER4/PPC970: 7.25
-C POWER5: 7.75
+C POWER3/PPC630 6-18
+C POWER4/PPC970 7.25? not updated for last file revision
+C POWER5 7.25
+C POWER6 14
C TODO
C * Try to reduce the number of needed live registers (at least r5 and r10
@@ -118,26 +119,18 @@ L(b10): ld r27, 8(up)
L(top): mulld r0, r26, r6
mulhdu r5, r26, r6
- ld r26, 0(up)
- nop
-
mulld r7, r27, r6
mulhdu r8, r27, r6
+ ld r26, 0(up)
ld r27, 8(up)
- nop
-
adde r0, r0, r12
adde r7, r7, r5
-
mulld r9, r26, r6
mulhdu r10, r26, r6
- ld r26, 16(up)
- nop
-
mulld r11, r27, r6
mulhdu r12, r27, r6
+ ld r26, 16(up)
ld r27, 24(up)
-
std r0, 0(rp)
adde r9, r9, r8
std r7, 8(rp)
@@ -151,13 +144,10 @@ L(top): mulld r0, r26, r6
L(end): mulld r0, r26, r6
mulhdu r5, r26, r6
-
mulld r7, r27, r6
mulhdu r8, r27, r6
-
adde r0, r0, r12
adde r7, r7, r5
-
std r0, 0(rp)
std r7, 8(rp)
L(ret): addze r3, r8