summaryrefslogtreecommitdiff
path: root/mpn/sparc64
diff options
context:
space:
mode:
authorTorbjorn Granlund <tege@gmplib.org>2013-04-30 00:15:07 +0200
committerTorbjorn Granlund <tege@gmplib.org>2013-04-30 00:15:07 +0200
commitf7f0ac7925727ac4b1909a3ec3c314702fed792d (patch)
tree589b64581d284af7947d719a88703e353d8a702f /mpn/sparc64
parent233061b2cb96a5e1a584960772e74da1bfc744c6 (diff)
downloadgmp-f7f0ac7925727ac4b1909a3ec3c314702fed792d.tar.gz
Rewrite SPARC T3 addmul_1.
Diffstat (limited to 'mpn/sparc64')
-rw-r--r--mpn/sparc64/ultrasparct3/addmul_1.asm168
1 files changed, 129 insertions, 39 deletions
diff --git a/mpn/sparc64/ultrasparct3/addmul_1.asm b/mpn/sparc64/ultrasparct3/addmul_1.asm
index 1a494c492..7074244f0 100644
--- a/mpn/sparc64/ultrasparct3/addmul_1.asm
+++ b/mpn/sparc64/ultrasparct3/addmul_1.asm
@@ -1,6 +1,6 @@
-dnl SPARC v9 mpn_addmul_1 for T3/T4.
+dnl SPARC v9 mpn_addmul_1 for T3/T4/T5.
-dnl Contributed to the GNU project by David Miller.
+dnl Contributed to the GNU project by David Miller and Torbjörn Granlund.
dnl Copyright 2013 Free Software Foundation, Inc.
@@ -22,8 +22,8 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C UltraSPARC T3: 28
-C UltraSPARC T4: 5.5
+C UltraSPARC T3: ?
+C UltraSPARC T4: 4.25 hopefully
C INPUT PARAMETERS
define(`rp', `%i0')
@@ -31,51 +31,141 @@ define(`up', `%i1')
define(`n', `%i2')
define(`v0', `%i3')
+define(`u0', `%l0')
+define(`u1', `%l1')
+define(`u2', `%l2')
+define(`u3', `%l3')
+define(`r0', `%l4')
+define(`r1', `%l5')
+define(`r2', `%l6')
+define(`r3', `%l7')
+
ASM_START()
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)
PROLOGUE(mpn_addmul_1)
save %sp, -176, %sp
- subcc n, 1, n
- be L(final_one)
- subcc %g0, %g0, %o5
-
-L(top):
- ldx [up+0], %l0
- ldx [up+8], %l1
- ldx [rp+0], %l2
- ldx [rp+8], %l3
- mulx %l0, v0, %o0
+ ldx [up+0], %g1
+
+ and n, 3, %g3
+ brz %g3, L(b0)
+ addcc %g0, %g0, %g5 C clear carry limb, flag
+ cmp %g3, 2
+ bcs %xcc, L(b01)
+ nop
+ be %xcc, L(b10)
+ ldx [up+8], %g5
+
+L(b11): ldx [up+16], u3
+ mulx %g1, v0, %o2
+ umulxhi(%g1, v0, %o3)
+ ldx [rp+0], r1
+ mulx %g5, v0, %o4
+ ldx [rp+8], r2
+ umulxhi(%g5, v0, %o5)
+ ldx [rp+16], r3
+ mulx u3, v0, %g4
+ umulxhi(u3, v0, %g5)
+ addcc %o3, %o4, %o4
+ addxccc(%o5, %g4, %g4)
+ addxc( %g0, %g5, %g5)
+ addcc r1, %o2, r1
+ stx r1, [rp+0]
+ addxccc(r2, %o4, r2)
+ stx r2, [rp+8]
+ addxccc(r3, %g4, r3)
+ stx r3, [rp+16]
+ add n, -3, n
+ add up, 24, up
+ brz n, L(xit)
+ add rp, 24, rp
+ b L(com)
+ nop
+
+L(b10): mulx %g1, v0, %o4
+ ldx [rp+0], r2
+ umulxhi(%g1, v0, %o5)
+ ldx [rp+8], r3
+ mulx %g5, v0, %g4
+ umulxhi(%g5, v0, %g5)
+ addcc %o5, %g4, %g4
+ addxc( %g0, %g5, %g5)
+ addcc r2, %o4, r2
+ stx r2, [rp+0]
+ addxccc(r3, %g4, r3)
+ stx r3, [rp+8]
+ add n, -2, n
add up, 16, up
- umulxhi(%l0, v0, %o1)
- add rp, 16, rp
- mulx %l1, v0, %o2
- sub n, 2, n
- umulxhi(%l1, v0, %o3)
- addxccc(%o5, %o0, %o0)
+ brz n, L(xit)
+ add rp, 16, rp
+ b L(com)
+ nop
+
+L(b01): ldx [rp+0], r3
+ mulx %g1, v0, %g4
+ umulxhi(%g1, v0, %g5)
+ addcc r3, %g4, r3
+ stx r3, [rp+0]
+ add n, -1, n
+ add up, 8, up
+ brz n, L(xit)
+ add rp, 8, rp
+
+L(com): ldx [up+0], %g1
+L(b0): ldx [up+8], u1
+ ldx [up+16], u2
+ ldx [up+24], u3
+ mulx %g1, v0, %o0
+ umulxhi(%g1, v0, %o1)
+ b L(lo0)
+ nop
+
+ ALIGN(16)
+L(top): ldx [up+0], u0
+ addxc( %g0, %g5, %g5) C propagate carry into carry limb
+ ldx [up+8], u1
+ addcc r0, %o0, r0
+ ldx [up+16], u2
+ addxccc(r1, %o2, r1)
+ ldx [up+24], u3
+ addxccc(r2, %o4, r2)
+ stx r0, [rp-32]
+ addxccc(r3, %g4, r3)
+ stx r1, [rp-24]
+ mulx u0, v0, %o0
+ stx r2, [rp-16]
+ umulxhi(u0, v0, %o1)
+ stx r3, [rp-8]
+L(lo0): mulx u1, v0, %o2
+ ldx [rp+0], r0
+ umulxhi(u1, v0, %o3)
+ ldx [rp+8], r1
+ mulx u2, v0, %o4
+ ldx [rp+16], r2
+ umulxhi(u2, v0, %o5)
+ ldx [rp+24], r3
+ mulx u3, v0, %g4
+ addxccc(%g5, %o0, %o0)
+ umulxhi(u3, v0, %g5)
+ add up, 32, up
addxccc(%o1, %o2, %o2)
- addxc( %g0, %o3, %o5)
- addcc %l2, %o0, %o0
- stx %o0, [rp-16]
- addxccc(%l3, %o2, %o2)
+ add rp, 32, rp
+ addxccc(%o3, %o4, %o4)
+ add n, -4, n
+ addxccc(%o5, %g4, %g4)
brgz n, L(top)
- stx %o2, [rp-8]
-
- brlz,pt n, L(done)
nop
-L(final_one):
- ldx [up+0], %l0
- ldx [rp+0], %l2
- mulx %l0, v0, %o0
- umulxhi(%l0, v0, %o1)
- addxccc(%o5, %o0, %o0)
- addxc( %g0, %o1, %o5)
- addcc %l2, %o0, %o0
- stx %o0, [rp+0]
-
-L(done):
- addxc( %g0, %o5, %i0)
+ addxc( %g0, %g5, %g5)
+ addcc r0, %o0, r0
+ stx r0, [rp-32]
+ addxccc(r1, %o2, r1)
+ stx r1, [rp-24]
+ addxccc(r2, %o4, r2)
+ stx r2, [rp-16]
+ addxccc(r3, %g4, r3)
+ stx r3, [rp-8]
+L(xit): addxc( %g0, %g5, %i0)
ret
restore
EPILOGUE()