summaryrefslogtreecommitdiff
path: root/mpn/alpha
diff options
context:
space:
mode:
author Torbjorn Granlund <tege@gmplib.org> 2010-03-10 01:54:25 +0100
committer Torbjorn Granlund <tege@gmplib.org> 2010-03-10 01:54:25 +0100
commit 3ef6f81b6a9190128aacf2ea193f19e718b589b5 (patch)
tree b137c4d7241cbe08af3c7b3bf8ee69a309885185 /mpn/alpha
parent 1fad060b5b5524722a3ee798b7ed570cab375603 (diff)
download gmp-3ef6f81b6a9190128aacf2ea193f19e718b589b5.tar.gz
Natively support alpha/ev6 mpn_mod_1s_4p.
Diffstat (limited to 'mpn/alpha')
-rw-r--r-- mpn/alpha/ev6/mod_1_4.asm 336
1 files changed, 336 insertions, 0 deletions
diff --git a/mpn/alpha/ev6/mod_1_4.asm b/mpn/alpha/ev6/mod_1_4.asm
new file mode 100644
index 000000000..4a597be81
--- /dev/null
+++ b/mpn/alpha/ev6/mod_1_4.asm
@@ -0,0 +1,336 @@
+dnl Alpha mpn_mod_1s_4p
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Optimise. 2.75 c/l should be possible.
+C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
+C * Make mpn_mod_1s_4p_cps work for ev4-ev5.
+C * Optimise feed-in code, starting the sw pipeline in switch code.
+C * Use fewer registers. Use r28 and r27.
+C * If we cannot reduce register usage, perhaps write a small-n basecase.
+C * Does it work for PIC?
+
+C cycles/limb
+C EV4: ?
+C EV5: 23
+C EV6: 3
+
+define(`ap', `r16') C limb pointer argument; moved to the vector end and then walked downwards
+define(`n', `r17') C limb count argument; reused as the loop counter
+define(`pl', `r24') C low word of the two-word partial sum
+define(`ph', `r25') C high word of the two-word partial sum
+define(`rl', `r6') C low word of the running two-word residue
+define(`rh', `r7') C high word of the running two-word residue
+define(`B1modb', `r1') C multiplier loaded from 16(r19)
+define(`B2modb', `r2') C multiplier loaded from 24(r19)
+define(`B3modb', `r3') C multiplier loaded from 32(r19)
+define(`B4modb', `r4') C multiplier loaded from 40(r19)
+define(`B5modb', `r5') C multiplier loaded from 48(r19)
+
+ASM_START()
+ .arch ev56 C NOTE(review): presumably needed for the ldbu byte loads in mpn_mod_1s_4p_cps -- confirm
+
+C mpn_mod_1s_4p
+C
+C Register inputs, as used by the code below:
+C   r16 (ap)  pointer to the limb vector
+C   r17 (n)   number of limbs
+C   r18       divisor b, as named by the comments in the final reduction
+C   r19       pointer to a table of quadwords: offset 0 = bi, 8 = cnt,
+C             16..48 = B1modb..B5modb
+C The result is left in r0.
+C
+C Limbs are consumed from the most significant end.  The top 1 to 4 limbs
+C (n mod 4, with 0 handled as 4) seed the two-word residue rh:rl.  Each
+C further group of four limbs l3:l2:l1:l0 is folded in as the two-word sum
+C   l0 + l1*B1modb + l2*B2modb + l3*B3modb + rl*B4modb + rh*B5modb
+C Finally rh:rl is folded once more with B1modb, shifted left by cnt and
+C reduced modulo b using bi.
+C NOTE(review): the table entries are presumably the B^k mod b values written
+C by mpn_mod_1s_4p_cps -- confirm against the generic C implementation.
+PROLOGUE(mpn_mod_1s_4p)
+ lda r30, -80(r30) C allocate an 80-byte stack frame
+ stq r9, 8(r30) C save r9-r15; all of them are used as scratch below
+ stq r10, 16(r30)
+ stq r11, 24(r30)
+ stq r12, 32(r30)
+ stq r13, 40(r30)
+ stq r14, 48(r30)
+ stq r15, 56(r30)
+ s8addq n, ap, ap C point ap at vector end
+
+ ldq B1modb, 16(r19) C load the five multipliers from the table at r19
+ ldq B2modb, 24(r19)
+ ldq B3modb, 32(r19)
+ ldq B4modb, 40(r19)
+ ldq B5modb, 48(r19)
+
+ and n, 3, r0 C r0 = n mod 4 selects the feed-in case
+ lda n, -4(n) C n is now biased by -4; n > 0 means another group of four follows
+ beq r0, L(b0) C n mod 4 = 0
+ lda r6, -2(r0) C r6 = (n mod 4) - 2, used only for the two branches below
+ blt r6, L(b1) C n mod 4 = 1
+ beq r6, L(b2) C n mod 4 = 2; fall through for n mod 4 = 3
+
+L(b3): ldq r21, -16(ap) C second limb from the top
+ ldq r22, -8(ap) C top limb
+ ldq r20, -24(ap) C third limb from the top
+ mulq r21, B1modb, r8 C r12:r8 = r21 * B1modb
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9 C r13:r9 = r22 * B2modb
+ umulh r22, B2modb, r13
+ addq r8, r20, pl
+ cmpult pl, r8, r0 C r0 = carry out of the low-word add
+ addq r0, r12, ph
+ addq r9, pl, rl
+ cmpult rl, r9, r0 C carry again
+ addq r13, ph, ph
+ addq r0, ph, rh C rh:rl = r20 + r21*B1modb + r22*B2modb
+ lda ap, -56(ap) C ap = address of the lowest limb of the next group of four, if any
+ br L(com)
+
+L(b0): ldq r21, -24(ap) C third limb from the top
+ ldq r22, -16(ap) C second limb from the top
+ ldq r23, -8(ap) C top limb
+ ldq r20, -32(ap) C fourth limb from the top
+ mulq r21, B1modb, r8 C r12:r8 = r21 * B1modb
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9 C r13:r9 = r22 * B2modb
+ umulh r22, B2modb, r13
+ mulq r23, B3modb, r10 C r14:r10 = r23 * B3modb
+ umulh r23, B3modb, r14
+ addq r8, r20, pl
+ cmpult pl, r8, r0 C each cmpult recovers the carry of the preceding low-word add
+ addq r0, r12, ph
+ addq r9, pl, pl
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, ph
+ addq r10, pl, rl
+ cmpult rl, r10, r0
+ addq r14, ph, ph
+ addq r0, ph, rh C rh:rl = r20 + r21*B1modb + r22*B2modb + r23*B3modb
+ lda ap, -64(ap) C ap = address of the lowest limb of the next group of four, if any
+ br L(com)
+
+L(b1): bis r31, r31, rh C rh = 0
+ ldq rl, -8(ap) C rl = top limb
+ lda ap, -40(ap) C ap = address of the lowest limb of the next group of four, if any
+ br L(com)
+
+L(b2): ldq r21, -8(ap) C top limb
+ ldq r20, -16(ap) C second limb from the top
+ mulq r21, B1modb, r8 C r12:r8 = r21 * B1modb
+ umulh r21, B1modb, r12
+ addq r8, r20, rl
+ cmpult rl, r8, r0 C carry out of the low-word add
+ addq r0, r12, rh C rh:rl = r20 + r21*B1modb
+ lda ap, -48(ap) C ap = address of the lowest limb of the next group of four, if any
+
+L(com): ble n, L(ed3) C no further group of four: go straight to the final fold
+ ldq r21, 8(ap) C load the first group: r20 is its lowest limb, r23 its highest
+ ldq r22, 16(ap)
+ ldq r23, 24(ap)
+ ldq r20, 0(ap)
+ lda n, -4(n)
+ lda ap, -32(ap) C step down to the following group
+ mulq r21, B1modb, r8 C start the products that the loop or L(ed2) will sum
+ umulh r21, B1modb, r12
+ mulq r22, B2modb, r9
+ umulh r22, B2modb, r13
+ mulq r23, B3modb, r10
+ umulh r23, B3modb, r14
+ mulq rl, B4modb, r11
+ umulh rl, B4modb, r15
+ ble n, L(ed2) C that was the last group: skip the loop
+
+C Main loop.  Each iteration sums the products issued for the previous group
+C while loading the next group and issuing its multiplies, so the statement
+C order matters.
+ ALIGN(16)
+L(top): ldq r21, 8(ap) C begin loading the next group
+ mulq rh, B5modb, rl C low half of rh*B5modb; rl is free since rl*B4modb was already issued
+ addq r8, r20, pl C start summing the previous group
+ ldq r22, 16(ap)
+ cmpult pl, r8, r0 C each cmpult recovers the carry of the preceding low-word add
+ umulh rh, B5modb, rh C high half of rh*B5modb
+ ldq r23, 24(ap)
+ addq r0, r12, ph
+ addq r9, pl, pl
+ mulq r21, B1modb, r8 C next group: r12:r8 = r21 * B1modb
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ umulh r21, B1modb, r12
+ lda ap, -32(ap) C step down to the following group
+ addq r0, ph, ph
+ addq r10, pl, pl
+ mulq r22, B2modb, r9 C next group: r13:r9 = r22 * B2modb
+ cmpult pl, r10, r0
+ addq r14, ph, ph
+ addq r11, pl, pl
+ umulh r22, B2modb, r13
+ addq r0, ph, ph
+ cmpult pl, r11, r0
+ addq r15, ph, ph
+ mulq r23, B3modb, r10 C next group: r14:r10 = r23 * B3modb
+ ldq r20, 32(ap) C lowest limb of the next group; ap was already decremented by 32 above
+ addq pl, rl, rl C add in the low half of rh*B5modb, giving the new rl
+ umulh r23, B3modb, r14
+ addq r0, ph, ph
+ cmpult rl, pl, r0
+ mulq rl, B4modb, r11 C next group: r15:r11 = new rl * B4modb
+ addq ph, rh, rh
+ umulh rl, B4modb, r15
+ addq r0, rh, rh C new rh
+ lda n, -4(n)
+ bgt n, L(top)
+
+C Wind-down: the same summation as the loop body, without starting another
+C group.
+L(ed2): mulq rh, B5modb, rl
+ addq r8, r20, pl
+ umulh rh, B5modb, rh
+ cmpult pl, r8, r0
+ addq r0, r12, ph
+ addq r9, pl, pl
+ cmpult pl, r9, r0
+ addq r13, ph, ph
+ addq r0, ph, ph
+ addq r10, pl, pl
+ cmpult pl, r10, r0
+ addq r14, ph, ph
+ addq r11, pl, pl
+ addq r0, ph, ph
+ cmpult pl, r11, r0
+ addq r15, ph, ph
+ addq pl, rl, rl
+ addq r0, ph, ph
+ cmpult rl, pl, r0
+ addq ph, rh, rh
+ addq r0, rh, rh
+
+L(ed3): mulq rh, B1modb, r8 C final fold: rh:rl = rh*B1modb + rl
+ umulh rh, B1modb, rh
+ addq r8, rl, rl
+ cmpult rl, r8, r0 C carry out of the low-word add
+ addq r0, rh, rh
+
+ ldq r24, 8(r19) C cnt
+ sll rh, r24, rh
+ subq r31, r24, r25 C r25 = -cnt; the shift uses only its low 6 bits, i.e. 64-cnt
+ srl rl, r25, r2 C bits of rl that move up into rh; NOTE(review): assumes cnt is nonzero -- confirm
+ sll rl, r24, rl
+ or r2, rh, rh C rh:rl is now the residue shifted left by cnt
+
+ ldq r23, 0(r19) C bi
+ mulq rh, r23, r8 C r9:r8 = rh * bi
+ umulh rh, r23, r9
+ addq rh, 1, r7 C r7 = rh + 1
+ addq r8, rl, r8 C ql
+ cmpult r8, rl, r0 C carry out of ql
+ addq r9, r7, r9
+ addq r0, r9, r9 C qh
+ mulq r9, r18, r21 C qh * b
+ subq rl, r21, rl C candidate remainder rl - qh*b
+ cmpult r8, rl, r0 C rl > ql
+ negq r0, r0 C turn the 0/1 flag into an all-zeros/all-ones mask
+ and r0, r18, r0 C b or 0
+ addq rl, r0, rl C add b back when rl > ql
+ cmpule r18, rl, r0 C rl >= b
+ negq r0, r0
+ and r0, r18, r0
+ subq rl, r0, rl C subtract b when rl >= b
+
+ srl rl, r24, r0 C result = rl >> cnt
+
+ ldq r9, 8(r30) C restore r9-r15
+ ldq r10, 16(r30)
+ ldq r11, 24(r30)
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+ ldq r14, 48(r30)
+ ldq r15, 56(r30)
+ lda r30, 80(r30) C release the stack frame
+ ret r31, (r26), 1
+EPILOGUE()
+
+C mpn_mod_1s_4p_cps
+C
+C Fills the seven-quadword table at r16 from the divisor in r17:
+C   0(r16)      value returned by mpn_invert_limb for r17 << cnt
+C   8(r16)      cnt
+C   16..48(r16) five successive values, each stored shifted right by cnt
+C cnt is derived from two __clz_tab lookups.
+C NOTE(review): cnt is presumably the number of leading zero bits in r17 --
+C confirm against the contents of __clz_tab.
+C The TODO list at the top of the file says this code was compiler
+C generated, which explains the interleaved scheduling and the use of
+C r26-r28 as scratch.
+PROLOGUE(mpn_mod_1s_4p_cps,gp)
+ ldgp r29, 0(r27) C NOTE(review): PROLOGUE with the gp flag may already emit this -- confirm in the macro definition
+ LEA( r28, __clz_tab) C r28 = address of __clz_tab
+ lda r30, -32(r30) C allocate a 32-byte stack frame
+ lda r5, 65(r31) C r5 = 65
+ cmpbge r31, r17, r8 C bit i of r8 is set where byte i of r17 is zero
+ stq r26, 0(r30) C save the return address; r26 is reused as scratch and by the jsr below
+ stq r10, 16(r30) C save r10
+ srl r8, 1, r7 C drop the flag for byte 0
+ xor r7, 127, r6 C invert: bits 0..6 now flag the nonzero bytes 1..7
+ stq r11, 24(r30) C save r11
+ stq r9, 8(r30) C save r9
+ bis r31, r16, r11 C r11 = table pointer, kept across the call
+ addq r6, r28, r4
+ ldbu r2, 0(r4) C r2 = __clz_tab[r6]
+ s8subq r2, 7, r3 C r3 = 8*r2 - 7
+ srl r17, r3, r27 C r27 = r17 >> r3
+ subq r5, r3, r26 C r26 = 65 - r3
+ addq r27, r28, r10
+ ldbu r9, 0(r10) C r9 = __clz_tab[r27]
+ subq r26, r9, r10 C r10 = cnt = 65 - r3 - r9
+ sll r17, r10, r9 C r9 = divisor shifted left by cnt
+ bis r31, r9, r16 C pass the shifted divisor as the argument
+ jsr r26, mpn_invert_limb C r0 = mpn_invert_limb(r9)
+ ldgp r29, 0(r26) C re-establish gp after the call
+ stq r10, 8(r11) C table[1] = cnt
+ subq r31, r10, r25 C r25 = -cnt; the shift below uses only its low 6 bits, i.e. 64-cnt
+ lda r24, 1(r31) C r24 = 1
+ subq r31, r9, r20 C r20 = -r9
+ stq r0, 0(r11) C table[0] = inverse returned by the call
+ srl r0, r25, r22 C r22 = r0 >> (64-cnt)
+ sll r24, r10, r23 C r23 = 1 << cnt
+ bis r22, r23, r21 C r21 = r22 | r23
+ mulq r20, r21, r1 C r1 = -r9 * r21, the first table value before shifting
+C Each of the next four steps derives a value x' from the previous x:
+C   (qh,ql) = x * r0;  x' = ~(qh + x) * r9;  if x' > ql then x' += r9
+ umulh r1, r0, r18 C qh
+ srl r1, r10, r19
+ mulq r1, r0, r8 C ql
+ stq r19, 16(r11) C table[2] = r1 >> cnt
+ addq r18, r1, r17
+ ornot r31, r17, r16 C r16 = ~(qh + x)
+ mulq r16, r9, r2
+ cmpule r2, r8, r7 C r7 = (x' <= ql)
+ addq r2, r9, r6
+ cmoveq r7, r6, r2 C x' += r9 when x' > ql
+ umulh r2, r0, r4 C second step, x = r2
+ srl r2, r10, r5
+ mulq r2, r0, r27
+ stq r5, 24(r11) C table[3] = r2 >> cnt
+ addq r4, r2, r3
+ ornot r31, r3, r28
+ mulq r28, r9, r23
+ cmpule r23, r27, r26
+ addq r23, r9, r25
+ cmoveq r26, r25, r23
+ ldq r26, 0(r30) C restore the return address; r26 is not used as scratch after this
+ umulh r23, r0, r22 C third step, x = r23
+ srl r23, r10, r24
+ mulq r23, r0, r19
+ stq r24, 32(r11) C table[4] = r23 >> cnt
+ addq r22, r23, r21
+ ornot r31, r21, r20
+ mulq r20, r9, r1
+ addq r1, r9, r17
+ cmpule r1, r19, r18
+ cmoveq r18, r17, r1
+ umulh r1, r0, r8 C fourth step, x = r1
+ srl r1, r10, r16
+ mulq r1, r0, r5
+ stq r16, 40(r11) C table[5] = r1 >> cnt
+ addq r8, r1, r7
+ ornot r31, r7, r6
+ mulq r6, r9, r2
+ addq r2, r9, r3
+ cmpule r2, r5, r4
+ ldq r9, 8(r30) C restore r9; the shifted divisor is no longer needed
+ cmoveq r4, r3, r2
+ srl r2, r10, r0
+ ldq r10, 16(r30) C restore r10; cnt is no longer needed
+ stq r0, 48(r11) C table[6] = r2 >> cnt
+ ldq r11, 24(r30) C restore r11
+ lda r30, 32(r30) C release the stack frame
+ ret r31, (r26), 1
+EPILOGUE()