diff options
author | Torbjorn Granlund <tege@gmplib.org> | 2010-04-21 21:02:49 +0200 |
---|---|---|
committer | Torbjorn Granlund <tege@gmplib.org> | 2010-04-21 21:02:49 +0200 |
commit | eb76cf0171a29adbb785aaf4987c2a36c7a8bf7c (patch) | |
tree | 3a200cae605a9c3f74a74cc04c0a0b8f62680270 /mpn/powerpc64 | |
parent | ef6a7533b04cb10e7f96e3c24bc33a4d9cbce943 (diff) | |
download | gmp-eb76cf0171a29adbb785aaf4987c2a36c7a8bf7c.tar.gz |
Swap multiply insns to make them consecutive, for the benefit of POWER6.
Diffstat (limited to 'mpn/powerpc64')
-rw-r--r-- | mpn/powerpc64/mode64/aorsmul_1.asm | 23 |
-rw-r--r-- | mpn/powerpc64/mode64/dive_1.asm | 28 |
-rw-r--r-- | mpn/powerpc64/mode64/divrem_2.asm | 12 |
-rw-r--r-- | mpn/powerpc64/mode64/mul_1.asm | 24 |
4 files changed, 38 insertions, 49 deletions
```diff
diff --git a/mpn/powerpc64/mode64/aorsmul_1.asm b/mpn/powerpc64/mode64/aorsmul_1.asm
index 939b2fe4d..f7ac9f002 100644
--- a/mpn/powerpc64/mode64/aorsmul_1.asm
+++ b/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -22,9 +22,10 @@ include(`../config.m4')
 
 C mpn_addmul_1 mpn_submul_1
 C cycles/limb cycles/limb
-C POWER3/PPC630: 6-18 6-18
-C POWER4/PPC970: 8 8.3
-C POWER5: 8 ?
+C POWER3/PPC630 6-18 6-18
+C POWER4/PPC970 8? 8.3? not updated for last file revision
+C POWER5 8 8.75
+C POWER6 16 16.5
 
 C TODO
 C * Try to reduce the number of needed live registers
@@ -118,10 +119,10 @@ L(gt1): ld r9, 0(up)
 	ld r27, 8(up)
 	mulld r0, r9, r6
 	mulhdu r5, r9, r6
-	ld r9, 16(up)
-	ld r28, 0(rp)
 	mulld r7, r27, r6
 	mulhdu r8, r27, r6
+	ld r9, 16(up)
+	ld r28, 0(rp)
 	ld r29, 8(rp)
 	ld r30, 16(rp)
 	mulld r11, r9, r6
@@ -151,20 +152,20 @@ L(b10): addic r0, r0, 0
 	ALIGN(16)
 L(top):	mulld r0, r9, r6
 	mulhdu r5, r9, r6	C 9
-	ld r9, 0(up)
-	ld r28, 0(rp)
 	mulld r7, r27, r6
 	mulhdu r8, r27, r6	C 27
+	ld r9, 0(up)
+	ld r28, 0(rp)
 	ld r27, 8(up)
 	ld r29, 8(rp)
 	adde r0, r0, r12	C 0 12
 	adde r7, r7, r5	C 5 7
 	mulld r5, r9, r6
 	mulhdu r10, r9, r6	C 9
-	ld r9, 16(up)
-	ld r30, 16(rp)
 	mulld r11, r27, r6
 	mulhdu r12, r27, r6	C 27
+	ld r9, 16(up)
+	ld r30, 16(rp)
 	ld r27, 24(up)
 	ld r31, 24(rp)
 	adde r5, r5, r8	C 8 5
@@ -185,12 +186,10 @@ L(bot): INVCY(r11)
 
 L(end):	mulld r0, r9, r6
 	mulhdu r5, r9, r6
-	ld r28, 0(rp)
-	nop
 	mulld r7, r27, r6
 	mulhdu r8, r27, r6
+	ld r28, 0(rp)
 	ld r29, 8(rp)
-	nop
 	adde r0, r0, r12
 	adde r7, r7, r5
 	addze r8, r8
diff --git a/mpn/powerpc64/mode64/dive_1.asm b/mpn/powerpc64/mode64/dive_1.asm
index 1f482bae5..d457d65e9 100644
--- a/mpn/powerpc64/mode64/dive_1.asm
+++ b/mpn/powerpc64/mode64/dive_1.asm
@@ -1,6 +1,6 @@
 dnl PowerPC-64 mpn_divexact_1 -- mpn by limb exact division.
 
-dnl Copyright 2006 Free Software Foundation, Inc.
+dnl Copyright 2006, 2010 Free Software Foundation, Inc.
 
 dnl This file is part of the GNU MP Library.
 
@@ -19,14 +19,15 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
-C cycles/limb
-C POWER3/PPC630: 13-19
-C POWER4/PPC970: 16
-C POWER5: 16
+C cycles/limb
+C norm unorm
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16 16
+C POWER6 37 46
 
 C TODO
 C * Check if n=1 code is really an improvement. It probably isn't.
-C * Perhaps remove L(norm) code, it is currently unreachable.
 C * Make more similar to mode1o.asm.
 
 C INPUT PARAMETERS
@@ -61,7 +62,6 @@ L(7):
 	mtctr n
 	LEA( r5, binvert_limb_table)
 	rldicl r11, d, 63, 57
-C cmpdi cr7, r0, 0
 	lbzx r0, r5, r11
 	mulld r9, r0, r0
 	sldi r0, r0, 1
@@ -75,26 +75,27 @@ C cmpdi cr7, r0, 0
 	sldi r0, r0, 1
 	mulld r9, d, r9
 	subf r7, r9, r0	C r7 = 1/d mod 2^64
-C beq cr7, L(norm)
+	bne cr0, L(norm)
 	subfic r8, r10, 64	C set carry as side effect
 	li r5, 0
+	srd r11, r12, r10
 
 	ALIGN(16)
 L(loop0):
-	srd r11, r12, r10
 	ld r12, 8(up)
+	nop
 	addi up, up, 8
 	sld r0, r12, r8
 	or r11, r11, r0
 	subfe r9, r5, r11
+	srd r11, r12, r10
 	mulld r0, r7, r9
+	mulhdu r5, r0, d
 	std r0, 0(rp)
 	addi rp, rp, 8
-	mulhdu r5, r0, d
 	bdnz L(loop0)
 
-	srd r0, r12, r10
-	subfe r0, r5, r0
+	subfe r0, r5, r11
 	mulld r0, r7, r0
 	std r0, 0(rp)
 	blr
@@ -102,14 +103,15 @@ L(loop0):
 	ALIGN(16)
 L(norm):
 	mulld r11, r12, r7
+	mulhdu r5, r11, d
 	std r11, 0(rp)
 	ALIGN(16)
 L(loop1):
-	mulhdu r5, r11, d
 	ld r9, 8(up)
 	addi up, up, 8
 	subfe r5, r5, r9
 	mulld r11, r7, r5
+	mulhdu r5, r11, d	C result not used
 	std r11, 8(rp)
 	addi rp, rp, 8
 	bdnz L(loop1)
diff --git a/mpn/powerpc64/mode64/divrem_2.asm b/mpn/powerpc64/mode64/divrem_2.asm
index 369b5c1f1..53ef1c708 100644
--- a/mpn/powerpc64/mode64/divrem_2.asm
+++ b/mpn/powerpc64/mode64/divrem_2.asm
@@ -22,11 +22,9 @@ include(`../config.m4')
 C cycles/limb
 C norm frac
 C POWER3/PPC630
-C POWER4/PPC970 39* 39*
-C POWER5 39* 39*
-
-C STATUS
-C * Performace fluctuates like crazy
+C POWER4/PPC970 ? ?
+C POWER5 37 ?
+C POWER6 62 ?
 
 C INPUT PARAMETERS
 C qp = r3
@@ -121,12 +119,12 @@ L(loop):
 	mulld r6, r29, r3
 	addc r6, r6, r31
 	adde r8, r8, r29
+	cmpd cr7, r27, r25
 	mulld r0, r30, r8
-	subf r31, r0, r31
 	mulhdu r11, r28, r8
 	mulld r10, r28, r8
+	subf r31, r0, r31
 	li r7, 0
-	cmpd cr7, r27, r25
 	blt cr7, L(60)
 	ld r7, 0(r26)
 	addi r26, r26, -8
diff --git a/mpn/powerpc64/mode64/mul_1.asm b/mpn/powerpc64/mode64/mul_1.asm
index 8f644d871..12bff2fb6 100644
--- a/mpn/powerpc64/mode64/mul_1.asm
+++ b/mpn/powerpc64/mode64/mul_1.asm
@@ -1,7 +1,7 @@
 dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
 dnl the result in a second limb vector.
 
-dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
+dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2010 Free Software
 dnl Foundation, Inc.
 
 dnl This file is part of the GNU MP Library.
@@ -22,9 +22,10 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
 include(`../config.m4')
 
 C cycles/limb
-C POWER3/PPC630: 6-18
-C POWER4/PPC970: 7.25
-C POWER5: 7.75
+C POWER3/PPC630 6-18
+C POWER4/PPC970 7.25? not updated for last file revision
+C POWER5 7.25
+C POWER6 14
 
 C TODO
 C * Try to reduce the number of needed live registers (at least r5 and r10
@@ -118,26 +119,18 @@ L(b10): ld r27, 8(up)
 
 L(top):	mulld r0, r26, r6
 	mulhdu r5, r26, r6
-	ld r26, 0(up)
-	nop
-
 	mulld r7, r27, r6
 	mulhdu r8, r27, r6
+	ld r26, 0(up)
 	ld r27, 8(up)
-	nop
-
 	adde r0, r0, r12
 	adde r7, r7, r5
-
 	mulld r9, r26, r6
 	mulhdu r10, r26, r6
-	ld r26, 16(up)
-	nop
-
 	mulld r11, r27, r6
 	mulhdu r12, r27, r6
+	ld r26, 16(up)
 	ld r27, 24(up)
-
 	std r0, 0(rp)
 	adde r9, r9, r8
 	std r7, 8(rp)
@@ -151,13 +144,10 @@ L(top): mulld r0, r26, r6
 
 L(end):	mulld r0, r26, r6
 	mulhdu r5, r26, r6
-
 	mulld r7, r27, r6
 	mulhdu r8, r27, r6
-
 	adde r0, r0, r12
 	adde r7, r7, r5
-
 	std r0, 0(rp)
 	std r7, 8(rp)
 L(ret):	addze r3, r8
```