diff options
author | tege <tege@gmplib.org> | 2000-03-17 07:08:03 +0100 |
---|---|---|
committer | tege <tege@gmplib.org> | 2000-03-17 07:08:03 +0100 |
commit | 106b678e54ca07182002077f4421890710626aa4 (patch) | |
tree | d3fe95a81926ddd8bc57a083ec50485ffaffee65 /mpn/alpha | |
parent | ef6177983d51165d7e0ef15a52b63fea72169805 (diff) | |
download | gmp-106b678e54ca07182002077f4421890710626aa4.tar.gz |
* Convert `.s' files to `.asm'.
* Port to Cray T3D/E systems.
Diffstat (limited to 'mpn/alpha')
-rw-r--r-- | mpn/alpha/README | 10 | ||||
-rw-r--r-- | mpn/alpha/add_n.asm | 114 | ||||
-rw-r--r-- | mpn/alpha/add_n.s | 120 | ||||
-rw-r--r-- | mpn/alpha/addmul_1.asm | 87 | ||||
-rw-r--r-- | mpn/alpha/addmul_1.s | 92 | ||||
-rw-r--r-- | mpn/alpha/cntlz.s | 70 | ||||
-rw-r--r-- | mpn/alpha/default.m4 | 56 | ||||
-rw-r--r-- | mpn/alpha/ev5/add_n.asm | 143 | ||||
-rw-r--r-- | mpn/alpha/ev5/add_n.s | 148 | ||||
-rw-r--r-- | mpn/alpha/ev5/lshift.asm | 169 | ||||
-rw-r--r-- | mpn/alpha/ev5/lshift.s | 174 | ||||
-rw-r--r-- | mpn/alpha/ev5/rshift.asm | 167 | ||||
-rw-r--r-- | mpn/alpha/ev5/rshift.s | 172 | ||||
-rw-r--r-- | mpn/alpha/ev5/sub_n.asm | 143 | ||||
-rw-r--r-- | mpn/alpha/ev5/sub_n.s | 148 | ||||
-rw-r--r-- | mpn/alpha/invert_limb.asm (renamed from mpn/alpha/invert-limb.s) | 171 | ||||
-rw-r--r-- | mpn/alpha/lshift.asm | 104 | ||||
-rw-r--r-- | mpn/alpha/lshift.s | 109 | ||||
-rw-r--r-- | mpn/alpha/mul_1.asm | 71 | ||||
-rw-r--r-- | mpn/alpha/mul_1.s | 85 | ||||
-rw-r--r-- | mpn/alpha/rshift.asm | 102 | ||||
-rw-r--r-- | mpn/alpha/rshift.s | 107 | ||||
-rw-r--r-- | mpn/alpha/sub_n.asm | 114 | ||||
-rw-r--r-- | mpn/alpha/sub_n.s | 120 | ||||
-rw-r--r-- | mpn/alpha/submul_1.asm | 87 | ||||
-rw-r--r-- | mpn/alpha/submul_1.s | 92 | ||||
-rw-r--r-- | mpn/alpha/unicos.m4 | 41 |
27 files changed, 1489 insertions, 1527 deletions
diff --git a/mpn/alpha/README b/mpn/alpha/README index fd9f78c79..a03153922 100644 --- a/mpn/alpha/README +++ b/mpn/alpha/README @@ -1,5 +1,15 @@ This directory contains mpn functions optimized for DEC Alpha processors. +ALPHA ASSEMBLY RULES AND REGULATIONS + +The `.prologue N' pseudo op marks the end of instruction that needs +special handling by unwinding. It also says whether $27 is really +needed for computing the gp. The `.mask M' pseudo op says which +registers are saved on the stack, and at what offset in the frame. + +Cray code is very very different... + + RELEVANT OPTIMIZATION ISSUES EV4 diff --git a/mpn/alpha/add_n.asm b/mpn/alpha/add_n.asm new file mode 100644 index 000000000..1abfd2d42 --- /dev/null +++ b/mpn/alpha/add_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + ldq r3,0(r17) + ldq r4,0(r18) + + subq r19,1,r19 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 + beq r2,$L0 C if multiple of 4 limbs, skip first loop + + subq r19,r2,r19 + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend + + ALIGN(8) +$Loop: subq r19,4,r19 + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/mpn/alpha/add_n.s b/mpn/alpha/add_n.s deleted file mode 100644 index 426556e39..000000000 --- a/mpn/alpha/add_n.s +++ /dev/null @@ -1,120 +0,0 @@ - # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and - # store sum in a third limb vector. - - # Copyright (C) 1995 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. 
- - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr $16 - # s1_ptr $17 - # s2_ptr $18 - # size $19 - - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_add_n - .ent __mpn_add_n -__mpn_add_n: - .frame $30,0,$26,0 - - ldq $3,0($17) - ldq $4,0($18) - - subq $19,1,$19 - and $19,4-1,$2 # number of limbs in first loop - bis $31,$31,$0 - beq $2,.L0 # if multiple of 4 limbs, skip first loop - - subq $19,$2,$19 - -.Loop0: subq $2,1,$2 - ldq $5,8($17) - addq $4,$0,$4 - ldq $6,8($18) - cmpult $4,$0,$1 - addq $3,$4,$4 - cmpult $4,$3,$0 - stq $4,0($16) - or $0,$1,$0 - - addq $17,8,$17 - addq $18,8,$18 - bis $5,$5,$3 - bis $6,$6,$4 - addq $16,8,$16 - bne $2,.Loop0 - -.L0: beq $19,.Lend - - .align 3 -.Loop: subq $19,4,$19 - - ldq $5,8($17) - addq $4,$0,$4 - ldq $6,8($18) - cmpult $4,$0,$1 - addq $3,$4,$4 - cmpult $4,$3,$0 - stq $4,0($16) - or $0,$1,$0 - - ldq $3,16($17) - addq $6,$0,$6 - ldq $4,16($18) - cmpult $6,$0,$1 - addq $5,$6,$6 - cmpult $6,$5,$0 - stq $6,8($16) - or $0,$1,$0 - - ldq $5,24($17) - addq $4,$0,$4 - ldq $6,24($18) - cmpult $4,$0,$1 - addq $3,$4,$4 - cmpult $4,$3,$0 - stq $4,16($16) - or $0,$1,$0 - - ldq $3,32($17) - addq $6,$0,$6 - ldq $4,32($18) - cmpult $6,$0,$1 - addq $5,$6,$6 - 
cmpult $6,$5,$0 - stq $6,24($16) - or $0,$1,$0 - - addq $17,32,$17 - addq $18,32,$18 - addq $16,32,$16 - bne $19,.Loop - -.Lend: addq $4,$0,$4 - cmpult $4,$0,$1 - addq $3,$4,$4 - cmpult $4,$3,$0 - stq $4,0($16) - or $0,$1,$0 - ret $31,($26),1 - - .end __mpn_add_n diff --git a/mpn/alpha/addmul_1.asm b/mpn/alpha/addmul_1.asm new file mode 100644 index 000000000..0a42326b3 --- /dev/null +++ b/mpn/alpha/addmul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published +dnl by the Free Software Foundation; either version 2 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
+ +ASM_START() +PROLOGUE(mpn_addmul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + addq r5,r3,r3 + cmpult r3,r5,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_addmul_1) +ASM_END() diff --git a/mpn/alpha/addmul_1.s b/mpn/alpha/addmul_1.s deleted file mode 100644 index 8513c13f5..000000000 --- a/mpn/alpha/addmul_1.s +++ /dev/null @@ -1,92 +0,0 @@ - # Alpha __mpn_addmul_1 -- Multiply a limb vector with a limb and add - # the result to a second limb vector. - - # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. 
- - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr r16 - # s1_ptr r17 - # size r18 - # s2_limb r19 - - # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. - - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_addmul_1 - .ent __mpn_addmul_1 2 -__mpn_addmul_1: - .frame $30,0,$26 - - ldq $2,0($17) # $2 = s1_limb - addq $17,8,$17 # s1_ptr++ - subq $18,1,$18 # size-- - mulq $2,$19,$3 # $3 = prod_low - ldq $5,0($16) # $5 = *res_ptr - umulh $2,$19,$0 # $0 = prod_high - beq $18,.Lend1 # jump if size was == 1 - ldq $2,0($17) # $2 = s1_limb - addq $17,8,$17 # s1_ptr++ - subq $18,1,$18 # size-- - addq $5,$3,$3 - cmpult $3,$5,$4 - stq $3,0($16) - addq $16,8,$16 # res_ptr++ - beq $18,.Lend2 # jump if size was == 2 - - .align 3 -.Loop: mulq $2,$19,$3 # $3 = prod_low - ldq $5,0($16) # $5 = *res_ptr - addq $4,$0,$0 # cy_limb = cy_limb + 'cy' - subq $18,1,$18 # size-- - umulh $2,$19,$4 # $4 = cy_limb - ldq $2,0($17) # $2 = s1_limb - addq $17,8,$17 # s1_ptr++ - addq $3,$0,$3 # $3 = cy_limb + prod_low - cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) - addq $5,$3,$3 - cmpult $3,$5,$5 - stq $3,0($16) - addq $16,8,$16 # res_ptr++ - addq $5,$0,$0 # combine carries - bne $18,.Loop - 
-.Lend2: mulq $2,$19,$3 # $3 = prod_low - ldq $5,0($16) # $5 = *res_ptr - addq $4,$0,$0 # cy_limb = cy_limb + 'cy' - umulh $2,$19,$4 # $4 = cy_limb - addq $3,$0,$3 # $3 = cy_limb + prod_low - cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) - addq $5,$3,$3 - cmpult $3,$5,$5 - stq $3,0($16) - addq $5,$0,$0 # combine carries - addq $4,$0,$0 # cy_limb = prod_high + cy - ret $31,($26),1 -.Lend1: addq $5,$3,$3 - cmpult $3,$5,$5 - stq $3,0($16) - addq $0,$5,$0 - ret $31,($26),1 - - .end __mpn_addmul_1 diff --git a/mpn/alpha/cntlz.s b/mpn/alpha/cntlz.s deleted file mode 100644 index e0f57c121..000000000 --- a/mpn/alpha/cntlz.s +++ /dev/null @@ -1,70 +0,0 @@ - # Alpha auxiliary for longlong.h's count_leading_zeros - - # Copyright (C) 1997 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. - - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - # DISCUSSION: - - # Other methods have been tried, and using a 128-entry table actually trims - # about 10% of the execution time (on a 21164) when the table is in the L1 - # cache. But under non-benchmarking conditions, the table will hardly be in - # the L1 cache. 
Tricky bit-fiddling methods with multiplies and magic tables - # are also possible, but they require many more instructions than the current - # code. (But for count_trailing_zeros, such tricks are beneficial.) - # Finally, converting to floating-point and extracting the exponent is much - # slower. - - .set noreorder - .set noat -.text - .align 3 - .globl __count_leading_zeros - .ent __count_leading_zeros 0 -__count_leading_zeros: - bis $31,63,$0 # initialize partial result count - - srl $16,32,$1 # shift down 32 steps -> r1 - cmovne $1,$1,$16 # select r1 if non-zero - cmovne $1,31,$0 # if r1 is nonzero choose smaller count - - srl $16,16,$1 # shift down 16 steps -> r1 - subq $0,16,$2 # generate new partial result count - cmovne $1,$1,$16 # choose new r1 if non-zero - cmovne $1,$2,$0 # choose new count if r1 was non-zero - - srl $16,8,$1 - subq $0,8,$2 - cmovne $1,$1,$16 - cmovne $1,$2,$0 - - srl $16,4,$1 - subq $0,4,$2 - cmovne $1,$1,$16 - cmovne $1,$2,$0 - - srl $16,2,$1 - subq $0,2,$2 - cmovne $1,$1,$16 - cmovne $1,$2,$0 - - srl $16,1,$1 # extract bit 1 - subq $0,$1,$0 # subtract it from partial result - - ret $31,($26),1 - .end __count_leading_zeros diff --git a/mpn/alpha/default.m4 b/mpn/alpha/default.m4 new file mode 100644 index 000000000..9b7e61a28 --- /dev/null +++ b/mpn/alpha/default.m4 @@ -0,0 +1,56 @@ +divert(-1) + +define(`ASM_START', + ` + .set noreorder + .set noat') + +define(`X',`0x$1') +define(`INT64', + ` + .align 3 +$1: .quad $2') + +define(`PROLOGUE', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + .frame r30,0,r26 + .prologue 0') + +define(`PROLOGUE_GP', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + ldgp r29,0(r27) + .frame r30,0,r26 + .prologue 1') + +define(`EPILOGUE', + ` + .end $1') + +dnl Map register names r0, r1, etc, to `$0', `$1', etc. 
+dnl This is needed on all systems but Unicos +forloop(i,0,31, +`define(`r'i,``$''i)' +) +forloop(i,0,31, +`define(`f'i,``$f''i)' +) + +define(`DATASTART', + `dnl + DATA +$1:') +define(`DATAEND',`dnl') + +define(`ASM_END',`dnl') + +divert diff --git a/mpn/alpha/ev5/add_n.asm b/mpn/alpha/ev5/add_n.asm new file mode 100644 index 000000000..9b3484aa9 --- /dev/null +++ b/mpn/alpha/ev5/add_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + bis r31,r31,r25 C clear cy + subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) + ldq r4,0(r17) + ldq r1,8(r18) + ldq r5,8(r17) + addq r17,32,r17 C update s1_ptr + ldq r2,16(r18) + addq r0,r4,r20 C 1st main add + ldq r3,24(r18) + subq r19,4,r19 C decr loop cnt + ldq r6,-16(r17) + cmpult r20,r0,r25 C compute cy from last add + ldq r7,-8(r17) + addq r1,r5,r28 C 2nd main add + addq r18,32,r18 C update s2_ptr + addq r28,r25,r21 C 2nd carry add + cmpult r28,r5,r8 C compute cy from last add + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r21,r28,r25 C compute cy from last add + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two adds + ldq r1,8(r18) + addq r2,r6,r28 C 3rd main add + ldq r4,0(r17) + addq r28,r25,r22 C 3rd carry add + ldq r5,8(r17) + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + addq r0,r4,r28 C 1st main add + ldq r2,16(r18) + addq r25,r28,r20 C 1st carry add + ldq r3,24(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r6,-16(r17) + cmpult r20,r28,r25 C compute cy from last add + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two adds + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + addq r1,r5,r28 C 2nd main add + stq r23,-8(r16) + addq r25,r28,r21 C 2nd carry add + addq r18,32,r18 C update s2_ptr + 
cmpult r28,r5,r8 C compute cy from last add + bge r19,$Loop +C Finish software pipeline for 1st loop +$Lend1: cmpult r21,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r2,r6,r28 C 3rd main add + addq r28,r25,r22 C 3rd carry add + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret +C Start software pipeline for 2nd loop + ldq r0,0(r18) + ldq r4,0(r17) + subq r19,1,r19 + beq r19,$Lend0 +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: addq r0,r4,r28 C main add + ldq r0,8(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r4,8(r17) + addq r28,r25,r20 C carry add + addq r18,8,r18 + addq r17,8,r17 + stq r20,0(r16) + cmpult r20,r28,r25 C compute cy from last add + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two adds + addq r16,8,r16 + bne r19,$Loop0 +$Lend0: addq r0,r4,r28 C main add + addq r28,r25,r20 C carry add + cmpult r28,r4,r8 C compute cy from last add + cmpult r20,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/mpn/alpha/ev5/add_n.s b/mpn/alpha/ev5/add_n.s deleted file mode 100644 index 66bb9b9fb..000000000 --- a/mpn/alpha/ev5/add_n.s +++ /dev/null @@ -1,148 +0,0 @@ - # Alpha EV5 __mpn_add_n -- Add two limb vectors of the same length > 0 and - # store sum in a third limb vector. - - # Copyright (C) 1995, 1999 Free Software Foundation, Inc. 
- - # This file is part of the GNU MP Library. - - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr $16 - # s1_ptr $17 - # s2_ptr $18 - # size $19 - - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_add_n - .ent __mpn_add_n -__mpn_add_n: - .frame $30,0,$26,0 - - or $31,$31,$25 # clear cy - subq $19,4,$19 # decr loop cnt - blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop - # Start software pipeline for 1st loop - ldq $0,0($18) - ldq $4,0($17) - ldq $1,8($18) - ldq $5,8($17) - addq $17,32,$17 # update s1_ptr - ldq $2,16($18) - addq $0,$4,$20 # 1st main add - ldq $3,24($18) - subq $19,4,$19 # decr loop cnt - ldq $6,-16($17) - cmpult $20,$0,$25 # compute cy from last add - ldq $7,-8($17) - addq $1,$5,$28 # 2nd main add - addq $18,32,$18 # update s2_ptr - addq $28,$25,$21 # 2nd carry add - cmpult $28,$5,$8 # compute cy from last add - blt $19,.Lend1 # if less than 4 limbs remain, jump - # 1st loop handles groups of 4 limbs in a software pipeline - .align 4 -.Loop: cmpult $21,$28,$25 # compute cy from last add - ldq $0,0($18) - or $8,$25,$25 # combine cy from the two adds - ldq $1,8($18) - addq $2,$6,$28 # 3rd main add - ldq $4,0($17) - addq $28,$25,$22 # 3rd carry add - ldq $5,8($17) - 
cmpult $28,$6,$8 # compute cy from last add - cmpult $22,$28,$25 # compute cy from last add - stq $20,0($16) - or $8,$25,$25 # combine cy from the two adds - stq $21,8($16) - addq $3,$7,$28 # 4th main add - addq $28,$25,$23 # 4th carry add - cmpult $28,$7,$8 # compute cy from last add - cmpult $23,$28,$25 # compute cy from last add - addq $17,32,$17 # update s1_ptr - or $8,$25,$25 # combine cy from the two adds - addq $16,32,$16 # update res_ptr - addq $0,$4,$28 # 1st main add - ldq $2,16($18) - addq $25,$28,$20 # 1st carry add - ldq $3,24($18) - cmpult $28,$4,$8 # compute cy from last add - ldq $6,-16($17) - cmpult $20,$28,$25 # compute cy from last add - ldq $7,-8($17) - or $8,$25,$25 # combine cy from the two adds - subq $19,4,$19 # decr loop cnt - stq $22,-16($16) - addq $1,$5,$28 # 2nd main add - stq $23,-8($16) - addq $25,$28,$21 # 2nd carry add - addq $18,32,$18 # update s2_ptr - cmpult $28,$5,$8 # compute cy from last add - bge $19,.Loop - # Finish software pipeline for 1st loop -.Lend1: cmpult $21,$28,$25 # compute cy from last add - or $8,$25,$25 # combine cy from the two adds - addq $2,$6,$28 # 3rd main add - addq $28,$25,$22 # 3rd carry add - cmpult $28,$6,$8 # compute cy from last add - cmpult $22,$28,$25 # compute cy from last add - stq $20,0($16) - or $8,$25,$25 # combine cy from the two adds - stq $21,8($16) - addq $3,$7,$28 # 4th main add - addq $28,$25,$23 # 4th carry add - cmpult $28,$7,$8 # compute cy from last add - cmpult $23,$28,$25 # compute cy from last add - or $8,$25,$25 # combine cy from the two adds - addq $16,32,$16 # update res_ptr - stq $22,-16($16) - stq $23,-8($16) -.Lend2: addq $19,4,$19 # restore loop cnt - beq $19,.Lret - # Start software pipeline for 2nd loop - ldq $0,0($18) - ldq $4,0($17) - subq $19,1,$19 - beq $19,.Lend0 - # 2nd loop handles remaining 1-3 limbs - .align 4 -.Loop0: addq $0,$4,$28 # main add - ldq $0,8($18) - cmpult $28,$4,$8 # compute cy from last add - ldq $4,8($17) - addq $28,$25,$20 # carry add - addq 
$18,8,$18 - addq $17,8,$17 - stq $20,0($16) - cmpult $20,$28,$25 # compute cy from last add - subq $19,1,$19 # decr loop cnt - or $8,$25,$25 # combine cy from the two adds - addq $16,8,$16 - bne $19,.Loop0 -.Lend0: addq $0,$4,$28 # main add - addq $28,$25,$20 # carry add - cmpult $28,$4,$8 # compute cy from last add - cmpult $20,$28,$25 # compute cy from last add - stq $20,0($16) - or $8,$25,$25 # combine cy from the two adds - -.Lret: or $25,$31,$0 # return cy - ret $31,($26),1 - .end __mpn_add_n diff --git a/mpn/alpha/ev5/lshift.asm b/mpn/alpha/ev5/lshift.asm new file mode 100644 index 000000000..23b9e8a10 --- /dev/null +++ b/mpn/alpha/ev5/lshift.asm @@ -0,0 +1,169 @@ +dnl Alpha EV5 __mpn_lshift -- Shift a number left. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. 
+ +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r31,r19,r20 + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 + and r18,4-1,r28 C number of limbs in first loop + srl r4,r20,r0 C compute function result + + beq r28,$L0 + subq r18,r28,r18 + + ALIGN(8) +$Loop0: ldq r3,-16(r17) + subq r16,8,r16 + sll r4,r19,r5 + subq r17,8,r17 + subq r28,1,r28 + srl r3,r20,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,0(r16) + bne r28,$Loop0 + +$L0: sll r4,r19,r24 + beq r18,$Lend +C warm up phase 1 + ldq r1,-16(r17) + subq r18,4,r18 + ldq r2,-24(r17) + ldq r3,-32(r17) + ldq r4,-40(r17) + beq r18,$Lend1 +C warm up phase 2 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + ldq r1,-48(r17) + sll r2,r19,r22 + ldq r2,-56(r17) + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + ldq r3,-64(r17) + sll r4,r19,r24 + ldq r4,-72(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + + srl r1,r20,r7 + subq r18,4,r18 + sll r1,r19,r21 + unop C ldq r31,-96(r17) + + srl r2,r20,r8 + ldq r1,-80(r17) + sll r2,r19,r22 + ldq r2,-88(r17) + + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + + srl r3,r20,r5 + unop C ldq r31,-96(r17) + sll r3,r19,r23 + subq r16,32,r16 + + srl r4,r20,r6 + ldq r3,-96(r17) + sll r4,r19,r24 + ldq r4,-104(r17) + + subq r17,32,r17 + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + srl r3,r20,r5 + sll r3,r19,r23 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 2/2 + stq r7,-40(r16) + bis r5,r22,r5 + stq r8,-48(r16) + bis r6,r23,r6 + stq r5,-56(r16) + stq r6,-64(r16) +C cool down phase 2/3 + stq r24,-72(r16) + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: 
srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 1/2 + stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + stq r5,-24(r16) + stq r6,-32(r16) + stq r24,-40(r16) + ret r31,(r26),1 + +$Lend: stq r24,-8(r16) + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/mpn/alpha/ev5/lshift.s b/mpn/alpha/ev5/lshift.s deleted file mode 100644 index ced55b720..000000000 --- a/mpn/alpha/ev5/lshift.s +++ /dev/null @@ -1,174 +0,0 @@ - # Alpha EV5 __mpn_lshift -- - - # Copyright (C) 1994, 1995 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. - - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr r16 - # s1_ptr r17 - # size r18 - # cnt r19 - - # This code runs at 3.25 cycles/limb on the EV5. 
- - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_lshift - .ent __mpn_lshift -__mpn_lshift: - .frame $30,0,$26,0 - - s8addq $18,$17,$17 # make r17 point at end of s1 - ldq $4,-8($17) # load first limb - subq $31,$19,$20 - s8addq $18,$16,$16 # make r16 point at end of RES - subq $18,1,$18 - and $18,4-1,$28 # number of limbs in first loop - srl $4,$20,$0 # compute function result - - beq $28,.L0 - subq $18,$28,$18 - - .align 3 -.Loop0: ldq $3,-16($17) - subq $16,8,$16 - sll $4,$19,$5 - subq $17,8,$17 - subq $28,1,$28 - srl $3,$20,$6 - or $3,$3,$4 - or $5,$6,$8 - stq $8,0($16) - bne $28,.Loop0 - -.L0: sll $4,$19,$24 - beq $18,.Lend - # warm up phase 1 - ldq $1,-16($17) - subq $18,4,$18 - ldq $2,-24($17) - ldq $3,-32($17) - ldq $4,-40($17) - beq $18,.Lend1 - # warm up phase 2 - srl $1,$20,$7 - sll $1,$19,$21 - srl $2,$20,$8 - ldq $1,-48($17) - sll $2,$19,$22 - ldq $2,-56($17) - srl $3,$20,$5 - or $7,$24,$7 - sll $3,$19,$23 - or $8,$21,$8 - srl $4,$20,$6 - ldq $3,-64($17) - sll $4,$19,$24 - ldq $4,-72($17) - subq $18,4,$18 - beq $18,.Lend2 - .align 4 - # main loop -.Loop: stq $7,-8($16) - or $5,$22,$5 - stq $8,-16($16) - or $6,$23,$6 - - srl $1,$20,$7 - subq $18,4,$18 - sll $1,$19,$21 - unop # ldq $31,-96($17) - - srl $2,$20,$8 - ldq $1,-80($17) - sll $2,$19,$22 - ldq $2,-88($17) - - stq $5,-24($16) - or $7,$24,$7 - stq $6,-32($16) - or $8,$21,$8 - - srl $3,$20,$5 - unop # ldq $31,-96($17) - sll $3,$19,$23 - subq $16,32,$16 - - srl $4,$20,$6 - ldq $3,-96($17) - sll $4,$19,$24 - ldq $4,-104($17) - - subq $17,32,$17 - bne $18,.Loop - # cool down phase 2/1 -.Lend2: stq $7,-8($16) - or $5,$22,$5 - stq $8,-16($16) - or $6,$23,$6 - srl $1,$20,$7 - sll $1,$19,$21 - srl $2,$20,$8 - sll $2,$19,$22 - stq $5,-24($16) - or $7,$24,$7 - stq $6,-32($16) - or $8,$21,$8 - srl $3,$20,$5 - sll $3,$19,$23 - srl $4,$20,$6 - sll $4,$19,$24 - # cool down phase 2/2 - stq $7,-40($16) - or $5,$22,$5 - stq $8,-48($16) - or $6,$23,$6 - stq $5,-56($16) - stq $6,-64($16) - # cool down 
phase 2/3 - stq $24,-72($16) - ret $31,($26),1 - - # cool down phase 1/1 -.Lend1: srl $1,$20,$7 - sll $1,$19,$21 - srl $2,$20,$8 - sll $2,$19,$22 - srl $3,$20,$5 - or $7,$24,$7 - sll $3,$19,$23 - or $8,$21,$8 - srl $4,$20,$6 - sll $4,$19,$24 - # cool down phase 1/2 - stq $7,-8($16) - or $5,$22,$5 - stq $8,-16($16) - or $6,$23,$6 - stq $5,-24($16) - stq $6,-32($16) - stq $24,-40($16) - ret $31,($26),1 - -.Lend: stq $24,-8($16) - ret $31,($26),1 - .end __mpn_lshift diff --git a/mpn/alpha/ev5/rshift.asm b/mpn/alpha/ev5/rshift.asm new file mode 100644 index 000000000..c3325579f --- /dev/null +++ b/mpn/alpha/ev5/rshift.asm @@ -0,0 +1,167 @@ +dnl Alpha EV5 __mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. 
+ +ASM_START() +PROLOGUE(mpn_rshift) +C Shift the size-limb vector at s1_ptr right by cnt bits and store it at +C res_ptr, working from the lowest address upward. r0 returns the bits +C shifted out of the first limb, left-justified. + ldq r4,0(r17) C load first limb + subq r31,r19,r20 C r20 = -cnt, i.e. 64-cnt as a shift count (only the low 6 bits are used) + subq r18,1,r18 C limbs remaining after the first + and r18,4-1,r28 C number of limbs in first loop + sll r4,r20,r0 C compute function result + + beq r28,$L0 + subq r18,r28,r18 C what is left for the 4-way code is a multiple of 4 + + ALIGN(8) +$Loop0: ldq r3,8(r17) + addq r16,8,r16 + srl r4,r19,r5 + addq r17,8,r17 + subq r28,1,r28 + sll r3,r20,r6 + bis r3,r3,r4 C r4 = r3 (register move) + bis r5,r6,r8 C merge the two parts of the result limb + stq r8,-8(r16) + bne r28,$Loop0 + +$L0: srl r4,r19,r24 + beq r18,$Lend +C warm up phase 1 + ldq r1,8(r17) + subq r18,4,r18 + ldq r2,16(r17) + ldq r3,24(r17) + ldq r4,32(r17) + beq r18,$Lend1 +C warm up phase 2 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + ldq r1,40(r17) + srl r2,r19,r22 + ldq r2,48(r17) + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll r4,r20,r6 + ldq r3,56(r17) + srl r4,r19,r24 + ldq r4,64(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + + sll r1,r20,r7 + subq r18,4,r18 + srl r1,r19,r21 + unop C ldq r31,-96(r17) + + sll r2,r20,r8 + ldq r1,72(r17) + srl r2,r19,r22 + ldq r2,80(r17) + + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + + sll r3,r20,r5 + unop C ldq r31,-96(r17) + srl r3,r19,r23 + addq r16,32,r16 + + sll r4,r20,r6 + ldq r3,88(r17) + srl r4,r19,r24 + ldq r4,96(r17) + + addq r17,32,r17 + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + sll r3,r20,r5 + srl r3,r19,r23 + sll r4,r20,r6 + srl r4,r19,r24 +C cool down phase 2/2 + stq r7,32(r16) + bis r5,r22,r5 + stq r8,40(r16) + bis r6,r23,r6 + stq r5,48(r16) + stq r6,56(r16) +C cool down phase 2/3 + stq r24,64(r16) + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll 
r4,r20,r6 + srl r4,r19,r24 +C cool down phase 1/2 + stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + stq r5,16(r16) + stq r6,24(r16) + stq r24,32(r16) + ret r31,(r26),1 + +$Lend: stq r24,0(r16) C store the final result limb + ret r31,(r26),1 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/mpn/alpha/ev5/rshift.s b/mpn/alpha/ev5/rshift.s deleted file mode 100644 index 6e24fef96..000000000 --- a/mpn/alpha/ev5/rshift.s +++ /dev/null @@ -1,172 +0,0 @@ - # Alpha EV5 __mpn_rshift -- - - # Copyright (C) 1994, 1995 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. - - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr r16 - # s1_ptr r17 - # size r18 - # cnt r19 - - # This code runs at 3.25 cycles/limb on the EV5. 
- - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_rshift - .ent __mpn_rshift -__mpn_rshift: - .frame $30,0,$26,0 - - ldq $4,0($17) # load first limb - subq $31,$19,$20 - subq $18,1,$18 - and $18,4-1,$28 # number of limbs in first loop - sll $4,$20,$0 # compute function result - - beq $28,.L0 - subq $18,$28,$18 - - .align 3 -.Loop0: ldq $3,8($17) - addq $16,8,$16 - srl $4,$19,$5 - addq $17,8,$17 - subq $28,1,$28 - sll $3,$20,$6 - or $3,$3,$4 - or $5,$6,$8 - stq $8,-8($16) - bne $28,.Loop0 - -.L0: srl $4,$19,$24 - beq $18,.Lend - # warm up phase 1 - ldq $1,8($17) - subq $18,4,$18 - ldq $2,16($17) - ldq $3,24($17) - ldq $4,32($17) - beq $18,.Lend1 - # warm up phase 2 - sll $1,$20,$7 - srl $1,$19,$21 - sll $2,$20,$8 - ldq $1,40($17) - srl $2,$19,$22 - ldq $2,48($17) - sll $3,$20,$5 - or $7,$24,$7 - srl $3,$19,$23 - or $8,$21,$8 - sll $4,$20,$6 - ldq $3,56($17) - srl $4,$19,$24 - ldq $4,64($17) - subq $18,4,$18 - beq $18,.Lend2 - .align 4 - # main loop -.Loop: stq $7,0($16) - or $5,$22,$5 - stq $8,8($16) - or $6,$23,$6 - - sll $1,$20,$7 - subq $18,4,$18 - srl $1,$19,$21 - unop # ldq $31,-96($17) - - sll $2,$20,$8 - ldq $1,72($17) - srl $2,$19,$22 - ldq $2,80($17) - - stq $5,16($16) - or $7,$24,$7 - stq $6,24($16) - or $8,$21,$8 - - sll $3,$20,$5 - unop # ldq $31,-96($17) - srl $3,$19,$23 - addq $16,32,$16 - - sll $4,$20,$6 - ldq $3,88($17) - srl $4,$19,$24 - ldq $4,96($17) - - addq $17,32,$17 - bne $18,.Loop - # cool down phase 2/1 -.Lend2: stq $7,0($16) - or $5,$22,$5 - stq $8,8($16) - or $6,$23,$6 - sll $1,$20,$7 - srl $1,$19,$21 - sll $2,$20,$8 - srl $2,$19,$22 - stq $5,16($16) - or $7,$24,$7 - stq $6,24($16) - or $8,$21,$8 - sll $3,$20,$5 - srl $3,$19,$23 - sll $4,$20,$6 - srl $4,$19,$24 - # cool down phase 2/2 - stq $7,32($16) - or $5,$22,$5 - stq $8,40($16) - or $6,$23,$6 - stq $5,48($16) - stq $6,56($16) - # cool down phase 2/3 - stq $24,64($16) - ret $31,($26),1 - - # cool down phase 1/1 -.Lend1: sll $1,$20,$7 - srl $1,$19,$21 - sll $2,$20,$8 - srl 
$2,$19,$22 - sll $3,$20,$5 - or $7,$24,$7 - srl $3,$19,$23 - or $8,$21,$8 - sll $4,$20,$6 - srl $4,$19,$24 - # cool down phase 1/2 - stq $7,0($16) - or $5,$22,$5 - stq $8,8($16) - or $6,$23,$6 - stq $5,16($16) - stq $6,24($16) - stq $24,32($16) - ret $31,($26),1 - -.Lend: stq $24,0($16) - ret $31,($26),1 - .end __mpn_rshift diff --git a/mpn/alpha/ev5/sub_n.asm b/mpn/alpha/ev5/sub_n.asm new file mode 100644 index 000000000..213c2c885 --- /dev/null +++ b/mpn/alpha/ev5/sub_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 +dnl and store difference in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_n) +C r25 carries the borrow (cy) from limb to limb; r8 holds the borrow of the +C current main subtract and is merged into r25 after the carry subtract. + bis r31,r31,r25 C clear cy + subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) + ldq r4,0(r17) + ldq r1,8(r18) + ldq r5,8(r17) + addq r17,32,r17 C update s1_ptr + ldq r2,16(r18) + subq r4,r0,r20 C 1st main subtract + ldq r3,24(r18) + subq r19,4,r19 C decr loop cnt + ldq r6,-16(r17) + cmpult r4,r0,r25 C compute cy from last subtract + ldq r7,-8(r17) + subq r5,r1,r28 C 2nd main subtract + addq r18,32,r18 C update s2_ptr + subq r28,r25,r21 C 2nd carry subtract + cmpult r5,r1,r8 C compute cy from last subtract + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r28,r25,r25 C compute cy from last subtract + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two subtracts + ldq r1,8(r18) + subq r6,r2,r28 C 3rd main subtract + ldq r4,0(r17) + subq r28,r25,r22 C 3rd carry subtract + ldq r5,8(r17) + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C 4th main subtract + subq r28,r25,r23 C 4th carry subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + subq r4,r0,r28 C 1st main subtract + ldq r2,16(r18) + subq r28,r25,r20 C 1st carry subtract + ldq r3,24(r18) + cmpult r4,r0,r8 C compute cy from last subtract + ldq r6,-16(r17) + cmpult r28,r25,r25 C compute cy from last subtract + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two subtracts + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + subq r5,r1,r28 C 
2nd main subtract + stq r23,-8(r16) + subq r28,r25,r21 C 2nd carry subtract + addq r18,32,r18 C update s2_ptr + cmpult r5,r1,r8 C compute cy from last subtract + bge r19,$Loop +C Finish software pipeline for 1st loop +$Lend1: cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + subq r6,r2,r28 C 3rd main subtract + subq r28,r25,r22 C 3rd carry subtract + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C 4th main subtract + subq r28,r25,r23 C 4th carry subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret C no limbs left, just return cy +C Start software pipeline for 2nd loop + ldq r0,0(r18) + ldq r4,0(r17) + subq r19,1,r19 + beq r19,$Lend0 +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: subq r4,r0,r28 C main subtract + cmpult r4,r0,r8 C compute cy from last subtract + ldq r0,8(r18) + ldq r4,8(r17) + subq r28,r25,r20 C carry subtract + addq r18,8,r18 + addq r17,8,r17 + stq r20,0(r16) + cmpult r28,r25,r25 C compute cy from last subtract + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,8,r16 + bne r19,$Loop0 +$Lend0: subq r4,r0,r28 C main subtract + subq r28,r25,r20 C carry subtract + cmpult r4,r0,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE(mpn_sub_n) +ASM_END() diff --git a/mpn/alpha/ev5/sub_n.s b/mpn/alpha/ev5/sub_n.s deleted file mode 100644 index 36994b956..000000000 --- a/mpn/alpha/ev5/sub_n.s +++ /dev/null @@ -1,148 +0,0 @@ - # Alpha EV5 
__mpn_sub_n -- Subtract two limb vectors of the same length > 0 - # and store difference in a third limb vector. - - # Copyright (C) 1995, 1999 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. - - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. 
- - - # INPUT PARAMETERS - # res_ptr $16 - # s1_ptr $17 - # s2_ptr $18 - # size $19 - - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_sub_n - .ent __mpn_sub_n -__mpn_sub_n: - .frame $30,0,$26,0 - - or $31,$31,$25 # clear cy - subq $19,4,$19 # decr loop cnt - blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop - # Start software pipeline for 1st loop - ldq $0,0($18) - ldq $4,0($17) - ldq $1,8($18) - ldq $5,8($17) - addq $17,32,$17 # update s1_ptr - ldq $2,16($18) - subq $4,$0,$20 # 1st main subtract - ldq $3,24($18) - subq $19,4,$19 # decr loop cnt - ldq $6,-16($17) - cmpult $4,$0,$25 # compute cy from last subtract - ldq $7,-8($17) - subq $5,$1,$28 # 2nd main subtract - addq $18,32,$18 # update s2_ptr - subq $28,$25,$21 # 2nd carry subtract - cmpult $5,$1,$8 # compute cy from last subtract - blt $19,.Lend1 # if less than 4 limbs remain, jump - # 1st loop handles groups of 4 limbs in a software pipeline - .align 4 -.Loop: cmpult $28,$25,$25 # compute cy from last subtract - ldq $0,0($18) - or $8,$25,$25 # combine cy from the two subtracts - ldq $1,8($18) - subq $6,$2,$28 # 3rd main subtract - ldq $4,0($17) - subq $28,$25,$22 # 3rd carry subtract - ldq $5,8($17) - cmpult $6,$2,$8 # compute cy from last subtract - cmpult $28,$25,$25 # compute cy from last subtract - stq $20,0($16) - or $8,$25,$25 # combine cy from the two subtracts - stq $21,8($16) - subq $7,$3,$28 # 4th main subtract - subq $28,$25,$23 # 4th carry subtract - cmpult $7,$3,$8 # compute cy from last subtract - cmpult $28,$25,$25 # compute cy from last subtract - addq $17,32,$17 # update s1_ptr - or $8,$25,$25 # combine cy from the two subtracts - addq $16,32,$16 # update res_ptr - subq $4,$0,$28 # 1st main subtract - ldq $2,16($18) - subq $28,$25,$20 # 1st carry subtract - ldq $3,24($18) - cmpult $4,$0,$8 # compute cy from last subtract - ldq $6,-16($17) - cmpult $28,$25,$25 # compute cy from last subtract - ldq $7,-8($17) - or $8,$25,$25 # combine cy from the two subtracts - subq 
$19,4,$19 # decr loop cnt - stq $22,-16($16) - subq $5,$1,$28 # 2nd main subtract - stq $23,-8($16) - subq $28,$25,$21 # 2nd carry subtract - addq $18,32,$18 # update s2_ptr - cmpult $5,$1,$8 # compute cy from last subtract - bge $19,.Loop - # Finish software pipeline for 1st loop -.Lend1: cmpult $28,$25,$25 # compute cy from last subtract - or $8,$25,$25 # combine cy from the two subtracts - subq $6,$2,$28 # cy add - subq $28,$25,$22 # 3rd main subtract - cmpult $6,$2,$8 # compute cy from last subtract - cmpult $28,$25,$25 # compute cy from last subtract - stq $20,0($16) - or $8,$25,$25 # combine cy from the two subtracts - stq $21,8($16) - subq $7,$3,$28 # cy add - subq $28,$25,$23 # 4th main subtract - cmpult $7,$3,$8 # compute cy from last subtract - cmpult $28,$25,$25 # compute cy from last subtract - or $8,$25,$25 # combine cy from the two subtracts - addq $16,32,$16 # update res_ptr - stq $22,-16($16) - stq $23,-8($16) -.Lend2: addq $19,4,$19 # restore loop cnt - beq $19,.Lret - # Start software pipeline for 2nd loop - ldq $0,0($18) - ldq $4,0($17) - subq $19,1,$19 - beq $19,.Lend0 - # 2nd loop handles remaining 1-3 limbs - .align 4 -.Loop0: subq $4,$0,$28 # main subtract - cmpult $4,$0,$8 # compute cy from last subtract - ldq $0,8($18) - ldq $4,8($17) - subq $28,$25,$20 # carry subtract - addq $18,8,$18 - addq $17,8,$17 - stq $20,0($16) - cmpult $28,$25,$25 # compute cy from last subtract - subq $19,1,$19 # decr loop cnt - or $8,$25,$25 # combine cy from the two subtracts - addq $16,8,$16 - bne $19,.Loop0 -.Lend0: subq $4,$0,$28 # main subtract - subq $28,$25,$20 # carry subtract - cmpult $4,$0,$8 # compute cy from last subtract - cmpult $28,$25,$25 # compute cy from last subtract - stq $20,0($16) - or $8,$25,$25 # combine cy from the two subtracts - -.Lret: or $25,$31,$0 # return cy - ret $31,($26),1 - .end __mpn_sub_n diff --git a/mpn/alpha/invert-limb.s b/mpn/alpha/invert_limb.asm index 9706f4b76..9e5cb22a1 100644 --- a/mpn/alpha/invert-limb.s +++ 
b/mpn/alpha/invert_limb.asm @@ -1,101 +1,90 @@ - # Alpha mpn_invert_normalized_limb -- Invert a normalized limb. +dnl Alpha mpn_invert_normalized_limb -- Invert a normalized limb. - # Copyright (C) 1996 Free Software Foundation, Inc. +dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc. - # This file is part of the GNU MP Library. +dnl This file is part of the GNU MP Library. - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
- # - # This is based on sophie:/gmp-stuff/dbg-inv-limb.c. - # The ideas are due to Peter L. Montgomery - # - # The table below uses 4096 bytes. The file mentioned above has an - # alternative function that doesn't require the table, but it runs 50% - # slower than this. +dnl +dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c. +dnl The ideas are due to Peter L. Montgomery +dnl +dnl The table below uses 4096 bytes. The file mentioned above has an +dnl alternative function that doesn't require the table, but it runs 50% +dnl slower than this. - .set noreorder - .set volatile - .set noat -.text - .align 3 -$C36: - .t_floating 9223372036854775808.0 -.text - .align 3 - .globl __mpn_invert_normalized_limb - .ent __mpn_invert_normalized_limb -__mpn_invert_normalized_limb: - ldgp $29,0($27) -__mpn_invert_normalized_limb..ng: - lda $30,-16($30) - .frame $30,16,$26,0 - .prologue 1 - addq $16,$16,$1 - bne $1,$73 - lda $0,-1 - br $31,.Lend +include(`../config.m4') + +ASM_START() + +INT64($C36,X(43e0000000000000)) C 2^63 + +PROLOGUE_GP(mpn_invert_normalized_limb) + lda r30,-16(r30) C 16 bytes of stack scratch for integer <-> FP register transfers + addq r16,r16,r1 C r1 = d << 1 + bne r1,$73 + lda r0,-1 C d << 1 == 0: return all ones + br r31,$Lend $73: - srl $16,1,$1 - stq $1,0($30) - ldt $f11,0($30) - cvtqt $f11,$f1 - lda $1,$C36 - ldt $f10,0($1) - divt $f10,$f1,$f10 - lda $2,invtab-4096 - srl $16,52,$1 - addq $1,$1,$1 - addq $1,$2,$1 - bic $1,6,$2 - ldq $2,0($2) - bic $1,1,$1 - extwl $2,$1,$2 - sll $2,48,$0 - umulh $16,$0,$1 - addq $16,$1,$3 - stq $3,0($30) - ldt $f11,0($30) - cvtqt $f11,$f1 - mult $f1,$f10,$f1 - cvttqc $f1,$f1 - stt $f1,0($30) - ldq $4,0($30) - subq $0,$4,$0 - umulh $16,$0,$1 - mulq $16,$0,$2 - addq $16,$1,$3 - bge $3,.Loop2 -.Loop1: addq $2,$16,$2 - cmpult $2,$16,$1 - addq $3,$1,$3 - addq $0,1,$0 - blt $3,.Loop1 -.Loop2: cmpult $2,$16,$1 - subq $0,1,$0 - subq $3,$1,$3 - subq $2,$16,$2 - bge $3,.Loop2 -.Lend: - addq $30,16,$30 - ret $31,($26),1 - .end __mpn_invert_normalized_limb -.text - .align 1 -invtab: + srl r16,1,r1 C d >> 1 (presumably so the signed cvtqt sees a non-negative value) + stq r1,0(r30) + ldt f11,0(r30) + cvtqt 
f11,f1 + lda r1,$C36 + ldt f10,0(r1) C f10 = 2^63 + divt f10,f1,f10 C f10 = 2^63 / (d >> 1) + lda r2,$invtab-4096 + srl r16,52,r1 C top 12 bits of d + addq r1,r1,r1 C times 2: byte offset of a 16-bit table entry + addq r1,r2,r1 + bic r1,6,r2 C address of the aligned quadword holding the entry + ldq r2,0(r2) + bic r1,1,r1 + extwl r2,r1,r2 C extract the 16-bit entry + sll r2,48,r0 C table value becomes the top 16 bits of the estimate + umulh r16,r0,r1 + addq r16,r1,r3 + stq r3,0(r30) + ldt f11,0(r30) + cvtqt f11,f1 + mult f1,f10,f1 + cvttq/c f1,f1 + stt f1,0(r30) + ldq r4,0(r30) + subq r0,r4,r0 + umulh r16,r0,r1 + mulq r16,r0,r2 + addq r16,r1,r3 + bge r3,$Loop2 +$Loop1: addq r2,r16,r2 + cmpult r2,r16,r1 + addq r3,r1,r3 + addq r0,1,r0 C step the estimate up while r3 is negative + blt r3,$Loop1 +$Loop2: cmpult r2,r16,r1 + subq r0,1,r0 C step the estimate down while r3 is non-negative + subq r3,r1,r3 + subq r2,r16,r2 + bge r3,$Loop2 +$Lend: + lda r30,16(r30) C release the stack scratch area + ret r31,(r26),1 +EPILOGUE(mpn_invert_normalized_limb) +DATASTART(`$invtab',4) .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41 .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46 .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50 @@ -352,3 +341,5 @@ invtab: .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111 .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090 .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010 +DATAEND() +ASM_END() diff --git a/mpn/alpha/lshift.asm b/mpn/alpha/lshift.asm new file mode 100644 index 000000000..de0ce473c --- /dev/null +++ b/mpn/alpha/lshift.asm @@ -0,0 +1,104 @@ +dnl Alpha mpn_lshift -- Shift a number left. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl make it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. +dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. 
+ +ASM_START() +PROLOGUE(mpn_lshift) +C Shift the size-limb vector at s1_ptr left by cnt bits and store it at +C res_ptr, working from the highest address downward. r0 returns the bits +C shifted out of the first (highest) limb, right-justified. + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r17,8,r17 + subq r31,r19,r7 C r7 = -cnt, i.e. 64-cnt as a shift count (only the low 6 bits are used) + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 C limbs remaining after the first + and r18,4-1,r20 C number of limbs in first loop + srl r4,r7,r0 C compute function result + + beq r20,$L0 + subq r18,r20,r18 C what is left for the unrolled loop is a multiple of 4 + + ALIGN(8) +$Loop0: + ldq r3,-8(r17) + subq r16,8,r16 + subq r17,8,r17 + subq r20,1,r20 + sll r4,r19,r5 + srl r3,r7,r6 + bis r3,r3,r4 C r4 = r3 (register move) + bis r5,r6,r8 C merge the two parts of the result limb + stq r8,0(r16) + bne r20,$Loop0 + +$L0: beq r18,$Lend + + ALIGN(8) +C main loop: four limbs per iteration +$Loop: ldq r3,-8(r17) + subq r16,32,r16 + subq r18,4,r18 + sll r4,r19,r5 + srl r3,r7,r6 + + ldq r4,-16(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,24(r16) + srl r4,r7,r2 + + ldq r3,-24(r17) + sll r4,r19,r5 + bis r1,r2,r8 + stq r8,16(r16) + srl r3,r7,r6 + + ldq r4,-32(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,8(r16) + srl r4,r7,r2 + + subq r17,32,r17 + bis r1,r2,r8 + stq r8,0(r16) + + bgt r18,$Loop + +$Lend: sll r4,r19,r8 C final result limb: last loaded limb shifted left + stq r8,-8(r16) + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/mpn/alpha/lshift.s b/mpn/alpha/lshift.s deleted file mode 100644 index 6a3e55a93..000000000 --- a/mpn/alpha/lshift.s +++ /dev/null @@ -1,109 +0,0 @@ - # Alpha __mpn_lshift -- - - # Copyright (C) 1994, 1995 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. - - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. 
- - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr r16 - # s1_ptr r17 - # size r18 - # cnt r19 - - # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, - # it would take 4 cycles/limb. It should be possible to get down to 3 - # cycles/limb since both ldq and stq can be paired with the other used - # instructions. But there are many restrictions in the 21064 pipeline that - # makes it hard, if not impossible, to get down to 3 cycles/limb: - - # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. - # 2. Only aligned instruction pairs can be paired. - # 3. The store buffer or silo might not be able to deal with the bandwidth. - - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_lshift - .ent __mpn_lshift -__mpn_lshift: - .frame $30,0,$26,0 - - s8addq $18,$17,$17 # make r17 point at end of s1 - ldq $4,-8($17) # load first limb - subq $17,8,$17 - subq $31,$19,$7 - s8addq $18,$16,$16 # make r16 point at end of RES - subq $18,1,$18 - and $18,4-1,$20 # number of limbs in first loop - srl $4,$7,$0 # compute function result - - beq $20,.L0 - subq $18,$20,$18 - - .align 3 -.Loop0: - ldq $3,-8($17) - subq $16,8,$16 - subq $17,8,$17 - subq $20,1,$20 - sll $4,$19,$5 - srl $3,$7,$6 - bis $3,$3,$4 - bis $5,$6,$8 - stq $8,0($16) - bne $20,.Loop0 - -.L0: beq $18,.Lend - - .align 3 -.Loop: ldq $3,-8($17) - subq $16,32,$16 - subq $18,4,$18 - sll $4,$19,$5 - srl $3,$7,$6 - - ldq $4,-16($17) - sll $3,$19,$1 - bis $5,$6,$8 - stq $8,24($16) - srl $4,$7,$2 - - ldq $3,-24($17) - sll $4,$19,$5 - bis $1,$2,$8 - stq $8,16($16) - srl $3,$7,$6 - - ldq $4,-32($17) - sll $3,$19,$1 - bis $5,$6,$8 - stq $8,8($16) - srl $4,$7,$2 - - subq $17,32,$17 - bis $1,$2,$8 - stq $8,0($16) - - bgt $18,.Loop - -.Lend: sll 
$4,$19,$8 - stq $8,-8($16) - ret $31,($26),1 - .end __mpn_lshift diff --git a/mpn/alpha/mul_1.asm b/mpn/alpha/mul_1.asm new file mode 100644 index 000000000..94cd55c9c --- /dev/null +++ b/mpn/alpha/mul_1.asm @@ -0,0 +1,71 @@ +dnl Alpha __mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
+ +ASM_START() +PROLOGUE(mpn_mul_1) +C Multiply the size-limb vector at s1_ptr by s2_limb, store the low limbs at +C res_ptr, and return the most significant limb of the product in r0. + ldq r2,0(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + bic r31,r31,r4 C clear cy_limb + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,8(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + stq r3,0(r16) C store low limb of the first product + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,16(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,8(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r16,8,r16 C res_ptr++ + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,8(r16) + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: stq r3,0(r16) C size == 1: r0 already holds prod_high + ret r31,(r26),1 +EPILOGUE(mpn_mul_1) +ASM_END() diff --git a/mpn/alpha/mul_1.s b/mpn/alpha/mul_1.s deleted file mode 100644 index 470c89368..000000000 --- a/mpn/alpha/mul_1.s +++ /dev/null @@ -1,85 +0,0 @@ - # Alpha __mpn_mul_1 -- Multiply a limb vector with a limb and store - # the result in a second limb vector. - - # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. - - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr r16 - # s1_ptr r17 - # size r18 - # s2_limb r19 - - # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5. - - # To improve performance for long multiplications, we would use - # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use - # these instructions without slowing down the general code: 1. We can - # only have two prefetches in operation at any time in the Alpha - # architecture. 2. There will seldom be any special alignment - # between RES_PTR and S1_PTR. Maybe we can simply divide the current - # loop into an inner and outer loop, having the inner loop handle - # exactly one prefetch block? 
- - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_mul_1 - .ent __mpn_mul_1 2 -__mpn_mul_1: - .frame $30,0,$26 - - ldq $2,0($17) # $2 = s1_limb - subq $18,1,$18 # size-- - mulq $2,$19,$3 # $3 = prod_low - bic $31,$31,$4 # clear cy_limb - umulh $2,$19,$0 # $0 = prod_high - beq $18,.Lend1 # jump if size was == 1 - ldq $2,8($17) # $2 = s1_limb - subq $18,1,$18 # size-- - stq $3,0($16) - beq $18,.Lend2 # jump if size was == 2 - - .align 3 -.Loop: mulq $2,$19,$3 # $3 = prod_low - addq $4,$0,$0 # cy_limb = cy_limb + 'cy' - subq $18,1,$18 # size-- - umulh $2,$19,$4 # $4 = cy_limb - ldq $2,16($17) # $2 = s1_limb - addq $17,8,$17 # s1_ptr++ - addq $3,$0,$3 # $3 = cy_limb + prod_low - stq $3,8($16) - cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) - addq $16,8,$16 # res_ptr++ - bne $18,.Loop - -.Lend2: mulq $2,$19,$3 # $3 = prod_low - addq $4,$0,$0 # cy_limb = cy_limb + 'cy' - umulh $2,$19,$4 # $4 = cy_limb - addq $3,$0,$3 # $3 = cy_limb + prod_low - cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) - stq $3,8($16) - addq $4,$0,$0 # cy_limb = prod_high + cy - ret $31,($26),1 -.Lend1: stq $3,0($16) - ret $31,($26),1 - - .end __mpn_mul_1 diff --git a/mpn/alpha/rshift.asm b/mpn/alpha/rshift.asm new file mode 100644 index 000000000..4c111d237 --- /dev/null +++ b/mpn/alpha/rshift.asm @@ -0,0 +1,102 @@ +dnl Alpha mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl makes it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. +dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. 
+ +ASM_START() +PROLOGUE(mpn_rshift) + ldq r4,0(r17) C load first limb + addq r17,8,r17 + subq r31,r19,r7 + subq r18,1,r18 + and r18,4-1,r20 C number of limbs in first loop + sll r4,r7,r0 C compute function result + + beq r20,$L0 + subq r18,r20,r18 + + ALIGN(8) +$Loop0: + ldq r3,0(r17) + addq r16,8,r16 + addq r17,8,r17 + subq r20,1,r20 + srl r4,r19,r5 + sll r3,r7,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,-8(r16) + bne r20,$Loop0 + +$L0: beq r18,$Lend + + ALIGN(8) +$Loop: ldq r3,0(r17) + addq r16,32,r16 + subq r18,4,r18 + srl r4,r19,r5 + sll r3,r7,r6 + + ldq r4,8(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-32(r16) + sll r4,r7,r2 + + ldq r3,16(r17) + srl r4,r19,r5 + bis r1,r2,r8 + stq r8,-24(r16) + sll r3,r7,r6 + + ldq r4,24(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-16(r16) + sll r4,r7,r2 + + addq r17,32,r17 + bis r1,r2,r8 + stq r8,-8(r16) + + bgt r18,$Loop + +$Lend: srl r4,r19,r8 + stq r8,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/mpn/alpha/rshift.s b/mpn/alpha/rshift.s deleted file mode 100644 index 12a3e369d..000000000 --- a/mpn/alpha/rshift.s +++ /dev/null @@ -1,107 +0,0 @@ - # Alpha __mpn_rshift -- - - # Copyright (C) 1994, 1995 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. - - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr r16 - # s1_ptr r17 - # size r18 - # cnt r19 - - # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, - # it would take 4 cycles/limb. It should be possible to get down to 3 - # cycles/limb since both ldq and stq can be paired with the other used - # instructions. But there are many restrictions in the 21064 pipeline that - # makes it hard, if not impossible, to get down to 3 cycles/limb: - - # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. - # 2. Only aligned instruction pairs can be paired. - # 3. The store buffer or silo might not be able to deal with the bandwidth. - - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_rshift - .ent __mpn_rshift -__mpn_rshift: - .frame $30,0,$26,0 - - ldq $4,0($17) # load first limb - addq $17,8,$17 - subq $31,$19,$7 - subq $18,1,$18 - and $18,4-1,$20 # number of limbs in first loop - sll $4,$7,$0 # compute function result - - beq $20,.L0 - subq $18,$20,$18 - - .align 3 -.Loop0: - ldq $3,0($17) - addq $16,8,$16 - addq $17,8,$17 - subq $20,1,$20 - srl $4,$19,$5 - sll $3,$7,$6 - bis $3,$3,$4 - bis $5,$6,$8 - stq $8,-8($16) - bne $20,.Loop0 - -.L0: beq $18,.Lend - - .align 3 -.Loop: ldq $3,0($17) - addq $16,32,$16 - subq $18,4,$18 - srl $4,$19,$5 - sll $3,$7,$6 - - ldq $4,8($17) - srl $3,$19,$1 - bis $5,$6,$8 - stq $8,-32($16) - sll $4,$7,$2 - - ldq $3,16($17) - srl $4,$19,$5 - bis $1,$2,$8 - stq $8,-24($16) - sll $3,$7,$6 - - ldq $4,24($17) - srl $3,$19,$1 - bis $5,$6,$8 - stq $8,-16($16) - sll $4,$7,$2 - - addq $17,32,$17 - bis $1,$2,$8 - stq $8,-8($16) - - bgt $18,.Loop - -.Lend: srl $4,$19,$8 - stq $8,0($16) - ret $31,($26),1 - .end __mpn_rshift diff --git a/mpn/alpha/sub_n.asm b/mpn/alpha/sub_n.asm new file mode 100644 index 000000000..e227af553 --- /dev/null +++ b/mpn/alpha/sub_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_sub_n -- 
Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_n) + ldq r3,0(r17) + ldq r4,0(r18) + + subq r19,1,r19 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 + beq r2,$L0 C if multiple of 4 limbs, skip first loop + + subq r19,r2,r19 + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend + + ALIGN(8) +$Loop: subq r19,4,r19 + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + ret r31,(r26),1 +EPILOGUE(mpn_sub_n) +ASM_END() diff --git a/mpn/alpha/sub_n.s b/mpn/alpha/sub_n.s deleted file mode 100644 index 3c90c1169..000000000 --- a/mpn/alpha/sub_n.s +++ /dev/null @@ -1,120 +0,0 @@ - # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and - # store difference in a third limb vector. - - # Copyright (C) 1995 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. 
- - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr $16 - # s1_ptr $17 - # s2_ptr $18 - # size $19 - - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_sub_n - .ent __mpn_sub_n -__mpn_sub_n: - .frame $30,0,$26,0 - - ldq $3,0($17) - ldq $4,0($18) - - subq $19,1,$19 - and $19,4-1,$2 # number of limbs in first loop - bis $31,$31,$0 - beq $2,.L0 # if multiple of 4 limbs, skip first loop - - subq $19,$2,$19 - -.Loop0: subq $2,1,$2 - ldq $5,8($17) - addq $4,$0,$4 - ldq $6,8($18) - cmpult $4,$0,$1 - subq $3,$4,$4 - cmpult $3,$4,$0 - stq $4,0($16) - or $0,$1,$0 - - addq $17,8,$17 - addq $18,8,$18 - bis $5,$5,$3 - bis $6,$6,$4 - addq $16,8,$16 - bne $2,.Loop0 - -.L0: beq $19,.Lend - - .align 3 -.Loop: subq $19,4,$19 - - ldq $5,8($17) - addq $4,$0,$4 - ldq $6,8($18) - cmpult $4,$0,$1 - subq $3,$4,$4 - cmpult $3,$4,$0 - stq $4,0($16) - or $0,$1,$0 - - ldq $3,16($17) - addq $6,$0,$6 - ldq $4,16($18) - cmpult $6,$0,$1 - subq $5,$6,$6 - cmpult $5,$6,$0 - stq $6,8($16) - or $0,$1,$0 - - ldq $5,24($17) - addq $4,$0,$4 - ldq $6,24($18) - cmpult $4,$0,$1 - subq $3,$4,$4 - cmpult $3,$4,$0 - stq $4,16($16) - or $0,$1,$0 - - ldq $3,32($17) - addq $6,$0,$6 - ldq $4,32($18) - cmpult $6,$0,$1 - subq $5,$6,$6 - 
cmpult $5,$6,$0 - stq $6,24($16) - or $0,$1,$0 - - addq $17,32,$17 - addq $18,32,$18 - addq $16,32,$16 - bne $19,.Loop - -.Lend: addq $4,$0,$4 - cmpult $4,$0,$1 - subq $3,$4,$4 - cmpult $3,$4,$0 - stq $4,0($16) - or $0,$1,$0 - ret $31,($26),1 - - .end __mpn_sub_n diff --git a/mpn/alpha/submul_1.asm b/mpn/alpha/submul_1.asm new file mode 100644 index 000000000..5122d9e80 --- /dev/null +++ b/mpn/alpha/submul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Library General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Library General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
+ +ASM_START() +PROLOGUE(mpn_submul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + subq r5,r3,r3 + cmpult r5,r3,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_submul_1) +ASM_END() diff --git a/mpn/alpha/submul_1.s b/mpn/alpha/submul_1.s deleted file mode 100644 index 319c10f07..000000000 --- a/mpn/alpha/submul_1.s +++ /dev/null @@ -1,92 +0,0 @@ - # Alpha __mpn_submul_1 -- Multiply a limb vector with a limb and - # subtract the result from a second limb vector. - - # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. - - # This file is part of the GNU MP Library. 
- - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your - # option) any later version. - - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public - # License for more details. - - # You should have received a copy of the GNU Library General Public License - # along with the GNU MP Library; see the file COPYING.LIB. If not, write to - # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - # MA 02111-1307, USA. - - - # INPUT PARAMETERS - # res_ptr r16 - # s1_ptr r17 - # size r18 - # s2_limb r19 - - # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. - - .set noreorder - .set noat -.text - .align 3 - .globl __mpn_submul_1 - .ent __mpn_submul_1 2 -__mpn_submul_1: - .frame $30,0,$26 - - ldq $2,0($17) # $2 = s1_limb - addq $17,8,$17 # s1_ptr++ - subq $18,1,$18 # size-- - mulq $2,$19,$3 # $3 = prod_low - ldq $5,0($16) # $5 = *res_ptr - umulh $2,$19,$0 # $0 = prod_high - beq $18,.Lend1 # jump if size was == 1 - ldq $2,0($17) # $2 = s1_limb - addq $17,8,$17 # s1_ptr++ - subq $18,1,$18 # size-- - subq $5,$3,$3 - cmpult $5,$3,$4 - stq $3,0($16) - addq $16,8,$16 # res_ptr++ - beq $18,.Lend2 # jump if size was == 2 - - .align 3 -.Loop: mulq $2,$19,$3 # $3 = prod_low - ldq $5,0($16) # $5 = *res_ptr - addq $4,$0,$0 # cy_limb = cy_limb + 'cy' - subq $18,1,$18 # size-- - umulh $2,$19,$4 # $4 = cy_limb - ldq $2,0($17) # $2 = s1_limb - addq $17,8,$17 # s1_ptr++ - addq $3,$0,$3 # $3 = cy_limb + prod_low - cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) - subq $5,$3,$3 - cmpult $5,$3,$5 - stq $3,0($16) - addq $16,8,$16 # res_ptr++ - addq $5,$0,$0 # combine carries - bne $18,.Loop - 
-.Lend2: mulq $2,$19,$3 # $3 = prod_low - ldq $5,0($16) # $5 = *res_ptr - addq $4,$0,$0 # cy_limb = cy_limb + 'cy' - umulh $2,$19,$4 # $4 = cy_limb - addq $3,$0,$3 # $3 = cy_limb + prod_low - cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) - subq $5,$3,$3 - cmpult $5,$3,$5 - stq $3,0($16) - addq $5,$0,$0 # combine carries - addq $4,$0,$0 # cy_limb = prod_high + cy - ret $31,($26),1 -.Lend1: subq $5,$3,$3 - cmpult $5,$3,$5 - stq $3,0($16) - addq $0,$5,$0 - ret $31,($26),1 - - .end __mpn_submul_1 diff --git a/mpn/alpha/unicos.m4 b/mpn/alpha/unicos.m4 new file mode 100644 index 000000000..93d97b7e1 --- /dev/null +++ b/mpn/alpha/unicos.m4 @@ -0,0 +1,41 @@ +divert(-1) + +define(`ASM_START', + `.ident dummy') + +define(`X',`^X$1') +define(`INT64', + `dnl + .psect $1@crud,data +$1: .quad $2 + .endp') + +define(`PROLOGUE', + `dnl + .stack 192 ; What does this mean? Only Cray knows. + .psect $1@code,code,cache +$1::') +define(`PROLOGUE_GP', `PROLOGUE($1)') + +define(`EPILOGUE', + `dnl + .endp') + +define(`DATASTART', + `dnl + .psect $1@crud,data +$1:') +define(`DATAEND', + `dnl + .endp') + +define(`ASM_END', + `dnl + .end') + +define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop + +define(`ALIGN',`') ; Unicos assembler seems to align using garbage + +divert + |