summary refs log tree commit diff
path: root/mpn/alpha
diff options
context:
space:
mode:
author Torbjorn Granlund <tege@gmplib.org> 2010-03-15 13:16:46 +0100
committer Torbjorn Granlund <tege@gmplib.org> 2010-03-15 13:16:46 +0100
commit 3bb04a012836b277deba0f6c222860b7402c9233 (patch)
tree 94a85686c8a2816a564f0bb3aa21eb07d0509815 /mpn/alpha
parent d49186780bed0af1a43f0ede9fa72e067e769226 (diff)
download gmp-3bb04a012836b277deba0f6c222860b7402c9233.tar.gz
Rewrite alpha mpn_mod_1s_4p_cps.
Diffstat (limited to 'mpn/alpha')
-rw-r--r-- mpn/alpha/ev6/mod_1_4.asm 210
1 files changed, 103 insertions, 107 deletions
diff --git a/mpn/alpha/ev6/mod_1_4.asm b/mpn/alpha/ev6/mod_1_4.asm
index 4a597be81..af56f42e3 100644
--- a/mpn/alpha/ev6/mod_1_4.asm
+++ b/mpn/alpha/ev6/mod_1_4.asm
@@ -2,7 +2,7 @@ dnl Alpha mpn_mod_1s_4p
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2009 Free Software Foundation, Inc.
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -24,8 +24,9 @@ include(`../config.m4')
C TODO:
C * Optimise. 2.75 c/l should be possible.
C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
-C * Make mpn_mod_1s_4p_cps work for ev4-ev5.
C * Optimise feed-in code, starting the sw pipeline in switch code.
+C * Shorten software pipeline. The mul instructions are scheduled too far
+C from their users.
C * Use fewer registers. Use r28 and r27.
C * If we cannot reduce register usage, write perhaps small-n basecase.
C * Does it work for PIC?
@@ -48,24 +49,19 @@ define(`B4modb', `r4')
define(`B5modb', `r5')
ASM_START()
- .arch ev56
-
PROLOGUE(mpn_mod_1s_4p)
- lda r30, -80(r30)
+ lda r30, -64(r30)
stq r9, 8(r30)
- stq r10, 16(r30)
- stq r11, 24(r30)
- stq r12, 32(r30)
- stq r13, 40(r30)
- stq r14, 48(r30)
- stq r15, 56(r30)
- s8addq n, ap, ap C point ap at vector end
-
ldq B1modb, 16(r19)
+ stq r10, 16(r30)
ldq B2modb, 24(r19)
+ stq r11, 24(r30)
ldq B3modb, 32(r19)
+ stq r12, 32(r30)
ldq B4modb, 40(r19)
+ stq r13, 40(r30)
ldq B5modb, 48(r19)
+ s8addq n, ap, ap C point ap at vector end
and n, 3, r0
lda n, -4(n)
@@ -100,7 +96,7 @@ L(b0): ldq r21, -24(ap)
mulq r22, B2modb, r9
umulh r22, B2modb, r13
mulq r23, B3modb, r10
- umulh r23, B3modb, r14
+ umulh r23, B3modb, r27
addq r8, r20, pl
cmpult pl, r8, r0
addq r0, r12, ph
@@ -110,7 +106,7 @@ L(b0): ldq r21, -24(ap)
addq r0, ph, ph
addq r10, pl, rl
cmpult rl, r10, r0
- addq r14, ph, ph
+ addq r27, ph, ph
addq r0, ph, rh
lda ap, -64(ap)
br L(com)
@@ -141,9 +137,9 @@ L(com): ble n, L(ed3)
mulq r22, B2modb, r9
umulh r22, B2modb, r13
mulq r23, B3modb, r10
- umulh r23, B3modb, r14
+ umulh r23, B3modb, r27
mulq rl, B4modb, r11
- umulh rl, B4modb, r15
+ umulh rl, B4modb, r28
ble n, L(ed2)
ALIGN(16)
@@ -165,21 +161,21 @@ L(top): ldq r21, 8(ap)
addq r10, pl, pl
mulq r22, B2modb, r9
cmpult pl, r10, r0
- addq r14, ph, ph
+ addq r27, ph, ph
addq r11, pl, pl
umulh r22, B2modb, r13
addq r0, ph, ph
cmpult pl, r11, r0
- addq r15, ph, ph
+ addq r28, ph, ph
mulq r23, B3modb, r10
ldq r20, 32(ap)
addq pl, rl, rl
- umulh r23, B3modb, r14
+ umulh r23, B3modb, r27
addq r0, ph, ph
cmpult rl, pl, r0
mulq rl, B4modb, r11
addq ph, rh, rh
- umulh rl, B4modb, r15
+ umulh rl, B4modb, r28
addq r0, rh, rh
lda n, -4(n)
bgt n, L(top)
@@ -195,11 +191,11 @@ L(ed2): mulq rh, B5modb, rl
addq r0, ph, ph
addq r10, pl, pl
cmpult pl, r10, r0
- addq r14, ph, ph
+ addq r27, ph, ph
addq r11, pl, pl
addq r0, ph, ph
cmpult pl, r11, r0
- addq r15, ph, ph
+ addq r28, ph, ph
addq pl, rl, rl
addq r0, ph, ph
cmpult rl, pl, r0
@@ -245,92 +241,92 @@ L(ed3): mulq rh, B1modb, r8
ldq r11, 24(r30)
ldq r12, 32(r30)
ldq r13, 40(r30)
- ldq r14, 48(r30)
- ldq r15, 56(r30)
- lda r30, 80(r30)
+ lda r30, 64(r30)
ret r31, (r26), 1
EPILOGUE()
PROLOGUE(mpn_mod_1s_4p_cps,gp)
- ldgp r29, 0(r27)
- LEA( r28, __clz_tab)
- lda r30, -32(r30)
- lda r5, 65(r31)
- cmpbge r31, r17, r8
- stq r26, 0(r30)
- stq r10, 16(r30)
- srl r8, 1, r7
- xor r7, 127, r6
- stq r11, 24(r30)
- stq r9, 8(r30)
- bis r31, r16, r11
- addq r6, r28, r4
- ldbu r2, 0(r4)
- s8subq r2, 7, r3
- srl r17, r3, r27
- subq r5, r3, r26
- addq r27, r28, r10
- ldbu r9, 0(r10)
- subq r26, r9, r10
- sll r17, r10, r9
- bis r31, r9, r16
- jsr r26, mpn_invert_limb
- ldgp r29, 0(r26)
- stq r10, 8(r11)
- subq r31, r10, r25
- lda r24, 1(r31)
- subq r31, r9, r20
- stq r0, 0(r11)
- srl r0, r25, r22
- sll r24, r10, r23
- bis r22, r23, r21
- mulq r20, r21, r1
- umulh r1, r0, r18
- srl r1, r10, r19
- mulq r1, r0, r8
- stq r19, 16(r11)
- addq r18, r1, r17
- ornot r31, r17, r16
- mulq r16, r9, r2
- cmpule r2, r8, r7
- addq r2, r9, r6
- cmoveq r7, r6, r2
- umulh r2, r0, r4
- srl r2, r10, r5
- mulq r2, r0, r27
- stq r5, 24(r11)
- addq r4, r2, r3
- ornot r31, r3, r28
- mulq r28, r9, r23
- cmpule r23, r27, r26
- addq r23, r9, r25
- cmoveq r26, r25, r23
- ldq r26, 0(r30)
- umulh r23, r0, r22
- srl r23, r10, r24
- mulq r23, r0, r19
- stq r24, 32(r11)
- addq r22, r23, r21
- ornot r31, r21, r20
- mulq r20, r9, r1
- addq r1, r9, r17
- cmpule r1, r19, r18
- cmoveq r18, r17, r1
- umulh r1, r0, r8
- srl r1, r10, r16
- mulq r1, r0, r5
- stq r16, 40(r11)
- addq r8, r1, r7
- ornot r31, r7, r6
- mulq r6, r9, r2
- addq r2, r9, r3
- cmpule r2, r5, r4
- ldq r9, 8(r30)
- cmoveq r4, r3, r2
- srl r2, r10, r0
- ldq r10, 16(r30)
- stq r0, 48(r11)
- ldq r11, 24(r30)
- lda r30, 32(r30)
- ret r31, (r26), 1
+ lda r30, -32(r30)	C cps: fill the table at r16 from the value in r17; reserve 32 bytes at r30
+ stq r26, 0(r30)	C save r26, which the jsr below overwrites
+ stq r9, 8(r30)	C save r9, r10, r11; all three are reloaded before ret
+ stq r10, 16(r30)
+ stq r11, 24(r30)
+ mov r16, r11	C r11 = table base, since r16 is reused as the call argument
+ LEA( r4, __clz_tab)	C r4 = address of __clz_tab
+ lda r10, 65(r31)	C r10 = 65, start value for the shift count
+ cmpbge r31, r17, r1	C r1 bit i is set when byte i of r17 is zero
+ srl r1, 1, r1	C discard the bit for byte 0
+ xor r1, 127, r1	C flip the 7 remaining bits: they now mark nonzero bytes 1..7
+ addq r1, r4, r1	C r1 = address of the __clz_tab byte for that mask
+ ldq_u r2, 0(r1)	C byte load done as ldq_u + extbl (removed code used ldbu)
+ extbl r2, r1, r2	C r2 = table byte
+ s8subq r2, 7, r2	C r2 = 8*r2 - 7
+ srl r17, r2, r3	C r3 = r17 >> r2
+ subq r10, r2, r10	C r10 = 65 - r2
+ addq r3, r4, r3	C r3 = address of the __clz_tab byte indexed by r3
+ ldq_u r1, 0(r3)
+ extbl r1, r3, r1	C r1 = table byte
+ subq r10, r1, r10	C r10 = final shift count
+ sll r17, r10, r9	C r9 = r17 << r10
+ mov r9, r16	C shifted value is the argument to mpn_invert_limb
+ jsr r26, mpn_invert_limb	C result comes back in r0
+ ldah r29, 0(r26)	C NOTE(review): with lda r29 below replaces the removed ldgp; both displacements are literal 0 - confirm they get relocated
+ subq r31, r10, r2	C r2 = -r10
+ lda r1, 1(r31)	C r1 = 1
+ sll r1, r10, r1	C r1 = 1 << r10
+ subq r31, r9, r3	C r3 = -r9
+ srl r0, r2, r2	C r2 = r0 >> -r10 (presumably the count is taken mod 64)
+ ldq r26, 0(r30)	C reload saved r26 for the final ret
+ bis r2, r1, r2	C r2 = (r0 >> -r10) | (1 << r10)
+ lda r29, 0(r29)	C second half of the r29 reload started by ldah above
+ stq r0, 0(r11)	C offset 0: mpn_invert_limb result
+ stq r10, 8(r11)	C offset 8: shift count
+ mulq r2, r3, r2	C r2 = low 64 bits of r2 * -r9
+ srl r2, r10, r3	C r3 = r2 >> r10
+ umulh r2, r0, r1	C reduction step 1 on r2: r1 = high half of r2 * r0
+ stq r3, 16(r11)	C offset 16; mpn_mod_1s_4p reads 16(r19) as B1modb
+ mulq r2, r0, r3	C r3 = low half of r2 * r0
+ ornot r31, r1, r1	C r1 = ~high
+ subq r1, r2, r1	C r1 = ~high - r2
+ mulq r1, r9, r1	C r1 = low 64 bits of r1 * r9
+ addq r1, r9, r2	C r2 = r1 + r9, the corrected candidate
+ cmpule r1, r3, r3	C r3 = 1 when r1 <= low half
+ cmoveq r3, r2, r1	C when r3 is 0 take the corrected candidate
+ srl r1, r10, r3	C r3 = r1 >> r10
+ umulh r1, r0, r2	C reduction step 2: same sequence, input r1, result r2
+ stq r3, 24(r11)	C offset 24; mpn_mod_1s_4p reads 24(r19) as B2modb
+ mulq r1, r0, r3
+ ornot r31, r2, r2
+ subq r2, r1, r2
+ mulq r2, r9, r2
+ addq r2, r9, r1
+ cmpule r2, r3, r3
+ cmoveq r3, r1, r2
+ srl r2, r10, r1	C r1 = r2 >> r10
+ umulh r2, r0, r3	C reduction step 3: same sequence, input r2, result r3
+ stq r1, 32(r11)	C offset 32; mpn_mod_1s_4p reads 32(r19) as B3modb
+ mulq r2, r0, r1
+ ornot r31, r3, r3
+ subq r3, r2, r3
+ mulq r3, r9, r3
+ addq r3, r9, r2
+ cmpule r3, r1, r1
+ cmoveq r1, r2, r3
+ srl r3, r10, r2	C r2 = r3 >> r10
+ umulh r3, r0, r1	C reduction step 4: same sequence, input r3, result r1
+ stq r2, 40(r11)	C offset 40; mpn_mod_1s_4p reads 40(r19) as B4modb
+ mulq r3, r0, r0	C low half overwrites r0; the inverse is not read again
+ ornot r31, r1, r1
+ subq r1, r3, r1
+ mulq r1, r9, r1
+ addq r1, r9, r9	C r9 is overwritten with the corrected candidate
+ cmpule r1, r0, r0
+ cmoveq r0, r9, r1
+ ldq r9, 8(r30)	C restore r9
+ srl r1, r10, r1	C r1 = r1 >> r10
+ ldq r10, 16(r30)	C restore r10
+ stq r1, 48(r11)	C offset 48; mpn_mod_1s_4p reads 48(r19) as B5modb
+ ldq r11, 24(r30)	C restore r11
+ lda r30, 32(r30)	C release the 32 bytes reserved on entry
+ ret r31, (r26), 1	C return through the reloaded r26
EPILOGUE()