summaryrefslogtreecommitdiff
path: root/mpn/alpha
diff options
context:
space:
mode:
authortege <tege@gmplib.org>2000-03-17 07:08:03 +0100
committertege <tege@gmplib.org>2000-03-17 07:08:03 +0100
commit106b678e54ca07182002077f4421890710626aa4 (patch)
treed3fe95a81926ddd8bc57a083ec50485ffaffee65 /mpn/alpha
parentef6177983d51165d7e0ef15a52b63fea72169805 (diff)
downloadgmp-106b678e54ca07182002077f4421890710626aa4.tar.gz
* Convert `.s' files to `.asm'.
* Port to Cray T3D/E systems.
Diffstat (limited to 'mpn/alpha')
-rw-r--r--mpn/alpha/README10
-rw-r--r--mpn/alpha/add_n.asm114
-rw-r--r--mpn/alpha/add_n.s120
-rw-r--r--mpn/alpha/addmul_1.asm87
-rw-r--r--mpn/alpha/addmul_1.s92
-rw-r--r--mpn/alpha/cntlz.s70
-rw-r--r--mpn/alpha/default.m456
-rw-r--r--mpn/alpha/ev5/add_n.asm143
-rw-r--r--mpn/alpha/ev5/add_n.s148
-rw-r--r--mpn/alpha/ev5/lshift.asm169
-rw-r--r--mpn/alpha/ev5/lshift.s174
-rw-r--r--mpn/alpha/ev5/rshift.asm167
-rw-r--r--mpn/alpha/ev5/rshift.s172
-rw-r--r--mpn/alpha/ev5/sub_n.asm143
-rw-r--r--mpn/alpha/ev5/sub_n.s148
-rw-r--r--mpn/alpha/invert_limb.asm (renamed from mpn/alpha/invert-limb.s)171
-rw-r--r--mpn/alpha/lshift.asm104
-rw-r--r--mpn/alpha/lshift.s109
-rw-r--r--mpn/alpha/mul_1.asm71
-rw-r--r--mpn/alpha/mul_1.s85
-rw-r--r--mpn/alpha/rshift.asm102
-rw-r--r--mpn/alpha/rshift.s107
-rw-r--r--mpn/alpha/sub_n.asm114
-rw-r--r--mpn/alpha/sub_n.s120
-rw-r--r--mpn/alpha/submul_1.asm87
-rw-r--r--mpn/alpha/submul_1.s92
-rw-r--r--mpn/alpha/unicos.m441
27 files changed, 1489 insertions, 1527 deletions
diff --git a/mpn/alpha/README b/mpn/alpha/README
index fd9f78c79..a03153922 100644
--- a/mpn/alpha/README
+++ b/mpn/alpha/README
@@ -1,5 +1,15 @@
This directory contains mpn functions optimized for DEC Alpha processors.
+ALPHA ASSEMBLY RULES AND REGULATIONS
+
+The `.prologue N' pseudo op marks the end of the instructions that need
+special handling by the unwinder. It also says whether $27 is really
+needed for computing the gp. The `.mask M' pseudo op says which
+registers are saved on the stack, and at what offset in the frame.
+
+Cray code is very very different...
+
+
RELEVANT OPTIMIZATION ISSUES
EV4
diff --git a/mpn/alpha/add_n.asm b/mpn/alpha/add_n.asm
new file mode 100644
index 000000000..1abfd2d42
--- /dev/null
+++ b/mpn/alpha/add_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ ldq r3,0(r17) C r3 = first s1 limb
+ ldq r4,0(r18) C r4 = first s2 limb
+
+ subq r19,1,r19 C size-1: the final limb is always handled at $Lend
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0 C clear carry (r0)
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19 C r19 = limbs left for the 4-way unrolled loop
+
+$Loop0: subq r2,1,r2 C first loop: one limb per iteration
+ ldq r5,8(r17) C preload next s1 limb
+ addq r4,r0,r4 C s2 limb + carry in
+ ldq r6,8(r18) C preload next s2 limb
+ cmpult r4,r0,r1 C r1 = carry out of (s2 limb + carry in)
+ addq r3,r4,r4 C add the s1 limb
+ cmpult r4,r3,r0 C r0 = carry out of that add
+ stq r4,0(r16) C store sum limb
+ bis r0,r1,r0 C combine the two carries
+
+ addq r17,8,r17 C s1_ptr++
+ addq r18,8,r18 C s2_ptr++
+ bis r5,r5,r3 C preloaded s1 limb becomes current
+ bis r6,r6,r4 C preloaded s2 limb becomes current
+ addq r16,8,r16 C res_ptr++
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend C no groups of 4 left: finish with the last limb
+
+ ALIGN(8)
+$Loop: subq r19,4,r19 C main loop: four limbs per iteration
+
+ ldq r5,8(r17) C limb 1 of the group uses r3/r4, preloads r5/r6
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0 C combine the two carries
+
+ ldq r3,16(r17) C limb 2 of the group uses r5/r6, preloads r3/r4
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17) C limb 3 of the group
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17) C limb 4 of the group; r3/r4 are left loaded for the next pass
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17 C advance all three pointers by four limbs
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4 C last limb: operands are already in r3/r4
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0 C r0 = final carry out
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/mpn/alpha/add_n.s b/mpn/alpha/add_n.s
deleted file mode 100644
index 426556e39..000000000
--- a/mpn/alpha/add_n.s
+++ /dev/null
@@ -1,120 +0,0 @@
- # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
- # store sum in a third limb vector.
-
- # Copyright (C) 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr $16
- # s1_ptr $17
- # s2_ptr $18
- # size $19
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_add_n
- .ent __mpn_add_n
-__mpn_add_n:
- .frame $30,0,$26,0
-
- ldq $3,0($17)
- ldq $4,0($18)
-
- subq $19,1,$19
- and $19,4-1,$2 # number of limbs in first loop
- bis $31,$31,$0
- beq $2,.L0 # if multiple of 4 limbs, skip first loop
-
- subq $19,$2,$19
-
-.Loop0: subq $2,1,$2
- ldq $5,8($17)
- addq $4,$0,$4
- ldq $6,8($18)
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,0($16)
- or $0,$1,$0
-
- addq $17,8,$17
- addq $18,8,$18
- bis $5,$5,$3
- bis $6,$6,$4
- addq $16,8,$16
- bne $2,.Loop0
-
-.L0: beq $19,.Lend
-
- .align 3
-.Loop: subq $19,4,$19
-
- ldq $5,8($17)
- addq $4,$0,$4
- ldq $6,8($18)
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,0($16)
- or $0,$1,$0
-
- ldq $3,16($17)
- addq $6,$0,$6
- ldq $4,16($18)
- cmpult $6,$0,$1
- addq $5,$6,$6
- cmpult $6,$5,$0
- stq $6,8($16)
- or $0,$1,$0
-
- ldq $5,24($17)
- addq $4,$0,$4
- ldq $6,24($18)
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,16($16)
- or $0,$1,$0
-
- ldq $3,32($17)
- addq $6,$0,$6
- ldq $4,32($18)
- cmpult $6,$0,$1
- addq $5,$6,$6
- cmpult $6,$5,$0
- stq $6,24($16)
- or $0,$1,$0
-
- addq $17,32,$17
- addq $18,32,$18
- addq $16,32,$16
- bne $19,.Loop
-
-.Lend: addq $4,$0,$4
- cmpult $4,$0,$1
- addq $3,$4,$4
- cmpult $4,$3,$0
- stq $4,0($16)
- or $0,$1,$0
- ret $31,($26),1
-
- .end __mpn_add_n
diff --git a/mpn/alpha/addmul_1.asm b/mpn/alpha/addmul_1.asm
new file mode 100644
index 000000000..0a42326b3
--- /dev/null
+++ b/mpn/alpha/addmul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published
+dnl by the Free Software Foundation; either version 2 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ addq r5,r3,r3 C r3 = *res_ptr + prod_low
+ cmpult r3,r5,r4 C r4 = carry from that add
+ stq r3,0(r16) C store first result limb
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3 C add in *res_ptr
+ cmpult r3,r5,r5 C r5 = carry from that add
+ stq r3,0(r16) C store result limb
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3 C add in *res_ptr
+ cmpult r3,r5,r5 C r5 = carry from that add
+ stq r3,0(r16) C store last result limb
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: addq r5,r3,r3 C size == 1: r3 = *res_ptr + prod_low
+ cmpult r3,r5,r5 C r5 = carry from that add
+ stq r3,0(r16) C store the only result limb
+ addq r0,r5,r0 C r0 = prod_high + carry
+ ret r31,(r26),1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/mpn/alpha/addmul_1.s b/mpn/alpha/addmul_1.s
deleted file mode 100644
index 8513c13f5..000000000
--- a/mpn/alpha/addmul_1.s
+++ /dev/null
@@ -1,92 +0,0 @@
- # Alpha __mpn_addmul_1 -- Multiply a limb vector with a limb and add
- # the result to a second limb vector.
-
- # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # s2_limb r19
-
- # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_addmul_1
- .ent __mpn_addmul_1 2
-__mpn_addmul_1:
- .frame $30,0,$26
-
- ldq $2,0($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- subq $18,1,$18 # size--
- mulq $2,$19,$3 # $3 = prod_low
- ldq $5,0($16) # $5 = *res_ptr
- umulh $2,$19,$0 # $0 = prod_high
- beq $18,.Lend1 # jump if size was == 1
- ldq $2,0($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- subq $18,1,$18 # size--
- addq $5,$3,$3
- cmpult $3,$5,$4
- stq $3,0($16)
- addq $16,8,$16 # res_ptr++
- beq $18,.Lend2 # jump if size was == 2
-
- .align 3
-.Loop: mulq $2,$19,$3 # $3 = prod_low
- ldq $5,0($16) # $5 = *res_ptr
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- subq $18,1,$18 # size--
- umulh $2,$19,$4 # $4 = cy_limb
- ldq $2,0($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- addq $5,$3,$3
- cmpult $3,$5,$5
- stq $3,0($16)
- addq $16,8,$16 # res_ptr++
- addq $5,$0,$0 # combine carries
- bne $18,.Loop
-
-.Lend2: mulq $2,$19,$3 # $3 = prod_low
- ldq $5,0($16) # $5 = *res_ptr
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- umulh $2,$19,$4 # $4 = cy_limb
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- addq $5,$3,$3
- cmpult $3,$5,$5
- stq $3,0($16)
- addq $5,$0,$0 # combine carries
- addq $4,$0,$0 # cy_limb = prod_high + cy
- ret $31,($26),1
-.Lend1: addq $5,$3,$3
- cmpult $3,$5,$5
- stq $3,0($16)
- addq $0,$5,$0
- ret $31,($26),1
-
- .end __mpn_addmul_1
diff --git a/mpn/alpha/cntlz.s b/mpn/alpha/cntlz.s
deleted file mode 100644
index e0f57c121..000000000
--- a/mpn/alpha/cntlz.s
+++ /dev/null
@@ -1,70 +0,0 @@
- # Alpha auxiliary for longlong.h's count_leading_zeros
-
- # Copyright (C) 1997 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
- # DISCUSSION:
-
- # Other methods have been tried, and using a 128-entry table actually trims
- # about 10% of the execution time (on a 21164) when the table is in the L1
- # cache. But under non-benchmarking conditions, the table will hardly be in
- # the L1 cache. Tricky bit-fiddling methods with multiplies and magic tables
- # are also possible, but they require many more instructions than the current
- # code. (But for count_trailing_zeros, such tricks are beneficial.)
- # Finally, converting to floating-point and extracting the exponent is much
- # slower.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __count_leading_zeros
- .ent __count_leading_zeros 0
-__count_leading_zeros:
- bis $31,63,$0 # initialize partial result count
-
- srl $16,32,$1 # shift down 32 steps -> r1
- cmovne $1,$1,$16 # select r1 if non-zero
- cmovne $1,31,$0 # if r1 is nonzero choose smaller count
-
- srl $16,16,$1 # shift down 16 steps -> r1
- subq $0,16,$2 # generate new partial result count
- cmovne $1,$1,$16 # choose new r1 if non-zero
- cmovne $1,$2,$0 # choose new count if r1 was non-zero
-
- srl $16,8,$1
- subq $0,8,$2
- cmovne $1,$1,$16
- cmovne $1,$2,$0
-
- srl $16,4,$1
- subq $0,4,$2
- cmovne $1,$1,$16
- cmovne $1,$2,$0
-
- srl $16,2,$1
- subq $0,2,$2
- cmovne $1,$1,$16
- cmovne $1,$2,$0
-
- srl $16,1,$1 # extract bit 1
- subq $0,$1,$0 # subtract it from partial result
-
- ret $31,($26),1
- .end __count_leading_zeros
diff --git a/mpn/alpha/default.m4 b/mpn/alpha/default.m4
new file mode 100644
index 000000000..9b7e61a28
--- /dev/null
+++ b/mpn/alpha/default.m4
@@ -0,0 +1,56 @@
+divert(-1) dnl discard output while the macros below are being defined
+
+define(`ASM_START',
+ `
+ .set noreorder
+ .set noat') dnl ASM_START: emits the two .set directives above
+
+define(`X',`0x$1') dnl X(n): prefixes its argument with 0x
+define(`INT64',
+ `
+ .align 3
+$1: .quad $2') dnl INT64(label,value): aligned label followed by one .quad
+
+define(`PROLOGUE',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ .frame r30,0,r26
+ .prologue 0') dnl PROLOGUE(name): global entry point, no gp setup
+
+define(`PROLOGUE_GP',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ ldgp r29,0(r27)
+ .frame r30,0,r26
+ .prologue 1') dnl PROLOGUE_GP(name): as PROLOGUE, but loads gp first via ldgp
+
+define(`EPILOGUE',
+ `
+ .end $1') dnl EPILOGUE(name): closes the .ent opened by PROLOGUE
+
+dnl Map register names r0, r1, etc, to `$0', `$1', etc.
+dnl This is needed on all systems but Unicos
+forloop(i,0,31,
+`define(`r'i,``$''i)'
+)
+forloop(i,0,31,
+`define(`f'i,``$f''i)'
+)
+
+define(`DATASTART',
+ `dnl
+ DATA
+$1:') dnl DATASTART(label): expands to DATA followed by the label
+define(`DATAEND',`dnl')
+
+define(`ASM_END',`dnl')
+
+divert
diff --git a/mpn/alpha/ev5/add_n.asm b/mpn/alpha/ev5/add_n.asm
new file mode 100644
index 000000000..9b3484aa9
--- /dev/null
+++ b/mpn/alpha/ev5/add_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18) C s2 limb 0
+ ldq r4,0(r17) C s1 limb 0
+ ldq r1,8(r18) C s2 limb 1
+ ldq r5,8(r17) C s1 limb 1
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18) C s2 limb 2
+ addq r0,r4,r20 C 1st main add
+ ldq r3,24(r18) C s2 limb 3
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17) C s1 limb 2 (s1_ptr already advanced)
+ cmpult r20,r0,r25 C compute cy from last add
+ ldq r7,-8(r17) C s1 limb 3 (s1_ptr already advanced)
+ addq r1,r5,r28 C 2nd main add
+ addq r18,32,r18 C update s2_ptr
+ addq r28,r25,r21 C 2nd carry add
+ cmpult r28,r5,r8 C compute cy from last add
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r21,r28,r25 C compute cy from last add
+ ldq r0,0(r18) C next group: s2 limb 0
+ bis r8,r25,r25 C combine cy from the two adds
+ ldq r1,8(r18) C next group: s2 limb 1
+ addq r2,r6,r28 C 3rd main add
+ ldq r4,0(r17) C next group: s1 limb 0
+ addq r28,r25,r22 C 3rd carry add
+ ldq r5,8(r17) C next group: s1 limb 1
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16) C store 1st sum limb
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16) C store 2nd sum limb
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ addq r0,r4,r28 C 1st main add
+ ldq r2,16(r18) C next group: s2 limb 2
+ addq r25,r28,r20 C 1st carry add
+ ldq r3,24(r18) C next group: s2 limb 3
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r6,-16(r17) C next group: s1 limb 2
+ cmpult r20,r28,r25 C compute cy from last add
+ ldq r7,-8(r17) C next group: s1 limb 3
+ bis r8,r25,r25 C combine cy from the two adds
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16) C store 3rd sum limb (res_ptr already advanced)
+ addq r1,r5,r28 C 2nd main add
+ stq r23,-8(r16) C store 4th sum limb (res_ptr already advanced)
+ addq r25,r28,r21 C 2nd carry add
+ addq r18,32,r18 C update s2_ptr
+ cmpult r28,r5,r8 C compute cy from last add
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r21,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r2,r6,r28 C 3rd main add
+ addq r28,r25,r22 C 3rd carry add
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16) C store 1st sum limb
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16) C store 2nd sum limb
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16) C store 3rd sum limb
+ stq r23,-8(r16) C store 4th sum limb
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret C no limbs left: just return cy
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18) C first remaining s2 limb
+ ldq r4,0(r17) C first remaining s1 limb
+ subq r19,1,r19 C the last limb is handled at $Lend0
+ beq r19,$Lend0 C only one limb remains
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: addq r0,r4,r28 C main add
+ ldq r0,8(r18) C preload next s2 limb
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r4,8(r17) C preload next s1 limb
+ addq r28,r25,r20 C carry add
+ addq r18,8,r18 C s2_ptr++
+ addq r17,8,r17 C s1_ptr++
+ stq r20,0(r16) C store sum limb
+ cmpult r20,r28,r25 C compute cy from last add
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,8,r16 C res_ptr++
+ bne r19,$Loop0
+$Lend0: addq r0,r4,r28 C main add
+ addq r28,r25,r20 C carry add
+ cmpult r28,r4,r8 C compute cy from last add
+ cmpult r20,r28,r25 C compute cy from last add
+ stq r20,0(r16) C store final sum limb
+ bis r8,r25,r25 C combine cy from the two adds
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/mpn/alpha/ev5/add_n.s b/mpn/alpha/ev5/add_n.s
deleted file mode 100644
index 66bb9b9fb..000000000
--- a/mpn/alpha/ev5/add_n.s
+++ /dev/null
@@ -1,148 +0,0 @@
- # Alpha EV5 __mpn_add_n -- Add two limb vectors of the same length > 0 and
- # store sum in a third limb vector.
-
- # Copyright (C) 1995, 1999 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr $16
- # s1_ptr $17
- # s2_ptr $18
- # size $19
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_add_n
- .ent __mpn_add_n
-__mpn_add_n:
- .frame $30,0,$26,0
-
- or $31,$31,$25 # clear cy
- subq $19,4,$19 # decr loop cnt
- blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
- # Start software pipeline for 1st loop
- ldq $0,0($18)
- ldq $4,0($17)
- ldq $1,8($18)
- ldq $5,8($17)
- addq $17,32,$17 # update s1_ptr
- ldq $2,16($18)
- addq $0,$4,$20 # 1st main add
- ldq $3,24($18)
- subq $19,4,$19 # decr loop cnt
- ldq $6,-16($17)
- cmpult $20,$0,$25 # compute cy from last add
- ldq $7,-8($17)
- addq $1,$5,$28 # 2nd main add
- addq $18,32,$18 # update s2_ptr
- addq $28,$25,$21 # 2nd carry add
- cmpult $28,$5,$8 # compute cy from last add
- blt $19,.Lend1 # if less than 4 limbs remain, jump
- # 1st loop handles groups of 4 limbs in a software pipeline
- .align 4
-.Loop: cmpult $21,$28,$25 # compute cy from last add
- ldq $0,0($18)
- or $8,$25,$25 # combine cy from the two adds
- ldq $1,8($18)
- addq $2,$6,$28 # 3rd main add
- ldq $4,0($17)
- addq $28,$25,$22 # 3rd carry add
- ldq $5,8($17)
- cmpult $28,$6,$8 # compute cy from last add
- cmpult $22,$28,$25 # compute cy from last add
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two adds
- stq $21,8($16)
- addq $3,$7,$28 # 4th main add
- addq $28,$25,$23 # 4th carry add
- cmpult $28,$7,$8 # compute cy from last add
- cmpult $23,$28,$25 # compute cy from last add
- addq $17,32,$17 # update s1_ptr
- or $8,$25,$25 # combine cy from the two adds
- addq $16,32,$16 # update res_ptr
- addq $0,$4,$28 # 1st main add
- ldq $2,16($18)
- addq $25,$28,$20 # 1st carry add
- ldq $3,24($18)
- cmpult $28,$4,$8 # compute cy from last add
- ldq $6,-16($17)
- cmpult $20,$28,$25 # compute cy from last add
- ldq $7,-8($17)
- or $8,$25,$25 # combine cy from the two adds
- subq $19,4,$19 # decr loop cnt
- stq $22,-16($16)
- addq $1,$5,$28 # 2nd main add
- stq $23,-8($16)
- addq $25,$28,$21 # 2nd carry add
- addq $18,32,$18 # update s2_ptr
- cmpult $28,$5,$8 # compute cy from last add
- bge $19,.Loop
- # Finish software pipeline for 1st loop
-.Lend1: cmpult $21,$28,$25 # compute cy from last add
- or $8,$25,$25 # combine cy from the two adds
- addq $2,$6,$28 # 3rd main add
- addq $28,$25,$22 # 3rd carry add
- cmpult $28,$6,$8 # compute cy from last add
- cmpult $22,$28,$25 # compute cy from last add
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two adds
- stq $21,8($16)
- addq $3,$7,$28 # 4th main add
- addq $28,$25,$23 # 4th carry add
- cmpult $28,$7,$8 # compute cy from last add
- cmpult $23,$28,$25 # compute cy from last add
- or $8,$25,$25 # combine cy from the two adds
- addq $16,32,$16 # update res_ptr
- stq $22,-16($16)
- stq $23,-8($16)
-.Lend2: addq $19,4,$19 # restore loop cnt
- beq $19,.Lret
- # Start software pipeline for 2nd loop
- ldq $0,0($18)
- ldq $4,0($17)
- subq $19,1,$19
- beq $19,.Lend0
- # 2nd loop handles remaining 1-3 limbs
- .align 4
-.Loop0: addq $0,$4,$28 # main add
- ldq $0,8($18)
- cmpult $28,$4,$8 # compute cy from last add
- ldq $4,8($17)
- addq $28,$25,$20 # carry add
- addq $18,8,$18
- addq $17,8,$17
- stq $20,0($16)
- cmpult $20,$28,$25 # compute cy from last add
- subq $19,1,$19 # decr loop cnt
- or $8,$25,$25 # combine cy from the two adds
- addq $16,8,$16
- bne $19,.Loop0
-.Lend0: addq $0,$4,$28 # main add
- addq $28,$25,$20 # carry add
- cmpult $28,$4,$8 # compute cy from last add
- cmpult $20,$28,$25 # compute cy from last add
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two adds
-
-.Lret: or $25,$31,$0 # return cy
- ret $31,($26),1
- .end __mpn_add_n
diff --git a/mpn/alpha/ev5/lshift.asm b/mpn/alpha/ev5/lshift.asm
new file mode 100644
index 000000000..23b9e8a10
--- /dev/null
+++ b/mpn/alpha/ev5/lshift.asm
@@ -0,0 +1,169 @@
+dnl Alpha EV5 mpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r31,r19,r20 C r20 = -cnt, the right-shift count (presumably acts as 64-cnt; confirm srl count width)
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18 C size-1: the lowest limb is stored separately at the end
+ and r18,4-1,r28 C number of limbs in first loop
+ srl r4,r20,r0 C compute function result
+
+ beq r28,$L0 C no leftover limbs: skip first loop
+ subq r18,r28,r18 C r18 = limbs left for the 4-way unrolled code
+
+ ALIGN(8)
+$Loop0: ldq r3,-16(r17) C next lower limb
+ subq r16,8,r16 C res_ptr--
+ sll r4,r19,r5 C current limb shifted left
+ subq r17,8,r17 C s1_ptr--
+ subq r28,1,r28 C decr first-loop count
+ srl r3,r20,r6 C bits coming in from the next lower limb
+ bis r3,r3,r4 C next lower limb becomes current
+ bis r5,r6,r8 C merge the two parts
+ stq r8,0(r16) C store result limb
+ bne r28,$Loop0
+
+$L0: sll r4,r19,r24 C r24 = current limb shifted left, low bits still missing
+ beq r18,$Lend C nothing left: store r24 and return
+C warm up phase 1
+ ldq r1,-16(r17)
+ subq r18,4,r18
+ ldq r2,-24(r17)
+ ldq r3,-32(r17)
+ ldq r4,-40(r17)
+ beq r18,$Lend1 C exactly one group of 4: take the short cool down
+C warm up phase 2
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ ldq r1,-48(r17)
+ sll r2,r19,r22
+ ldq r2,-56(r17)
+ srl r3,r20,r5
+ bis r7,r24,r7 C merge with the left-shifted part of the limb above
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ ldq r3,-64(r17)
+ sll r4,r19,r24
+ ldq r4,-72(r17)
+ subq r18,4,r18
+ beq r18,$Lend2 C two groups of 4 only: skip the main loop
+ ALIGN(16)
+C main loop
+$Loop: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+
+ srl r1,r20,r7
+ subq r18,4,r18 C decr loop cnt
+ sll r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ srl r2,r20,r8
+ ldq r1,-80(r17)
+ sll r2,r19,r22
+ ldq r2,-88(r17)
+
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+
+ srl r3,r20,r5
+ unop C ldq r31,-96(r17)
+ sll r3,r19,r23
+ subq r16,32,r16 C res_ptr -= 4 limbs
+
+ srl r4,r20,r6
+ ldq r3,-96(r17)
+ sll r4,r19,r24
+ ldq r4,-104(r17)
+
+ subq r17,32,r17 C s1_ptr -= 4 limbs
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+ srl r3,r20,r5
+ sll r3,r19,r23
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 2/2
+ stq r7,-40(r16)
+ bis r5,r22,r5
+ stq r8,-48(r16)
+ bis r6,r23,r6
+ stq r5,-56(r16)
+ stq r6,-64(r16)
+C cool down phase 2/3
+ stq r24,-72(r16) C lowest result limb: only the left-shifted part
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 1/2
+ stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ stq r5,-24(r16)
+ stq r6,-32(r16)
+ stq r24,-40(r16) C lowest result limb: only the left-shifted part
+ ret r31,(r26),1
+
+$Lend: stq r24,-8(r16) C lowest result limb: only the left-shifted part
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/mpn/alpha/ev5/lshift.s b/mpn/alpha/ev5/lshift.s
deleted file mode 100644
index ced55b720..000000000
--- a/mpn/alpha/ev5/lshift.s
+++ /dev/null
@@ -1,174 +0,0 @@
- # Alpha EV5 __mpn_lshift --
-
- # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # cnt r19
-
- # This code runs at 3.25 cycles/limb on the EV5.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_lshift
- .ent __mpn_lshift
-__mpn_lshift:
- .frame $30,0,$26,0
-
- s8addq $18,$17,$17 # make r17 point at end of s1
- ldq $4,-8($17) # load first limb
- subq $31,$19,$20
- s8addq $18,$16,$16 # make r16 point at end of RES
- subq $18,1,$18
- and $18,4-1,$28 # number of limbs in first loop
- srl $4,$20,$0 # compute function result
-
- beq $28,.L0
- subq $18,$28,$18
-
- .align 3
-.Loop0: ldq $3,-16($17)
- subq $16,8,$16
- sll $4,$19,$5
- subq $17,8,$17
- subq $28,1,$28
- srl $3,$20,$6
- or $3,$3,$4
- or $5,$6,$8
- stq $8,0($16)
- bne $28,.Loop0
-
-.L0: sll $4,$19,$24
- beq $18,.Lend
- # warm up phase 1
- ldq $1,-16($17)
- subq $18,4,$18
- ldq $2,-24($17)
- ldq $3,-32($17)
- ldq $4,-40($17)
- beq $18,.Lend1
- # warm up phase 2
- srl $1,$20,$7
- sll $1,$19,$21
- srl $2,$20,$8
- ldq $1,-48($17)
- sll $2,$19,$22
- ldq $2,-56($17)
- srl $3,$20,$5
- or $7,$24,$7
- sll $3,$19,$23
- or $8,$21,$8
- srl $4,$20,$6
- ldq $3,-64($17)
- sll $4,$19,$24
- ldq $4,-72($17)
- subq $18,4,$18
- beq $18,.Lend2
- .align 4
- # main loop
-.Loop: stq $7,-8($16)
- or $5,$22,$5
- stq $8,-16($16)
- or $6,$23,$6
-
- srl $1,$20,$7
- subq $18,4,$18
- sll $1,$19,$21
- unop # ldq $31,-96($17)
-
- srl $2,$20,$8
- ldq $1,-80($17)
- sll $2,$19,$22
- ldq $2,-88($17)
-
- stq $5,-24($16)
- or $7,$24,$7
- stq $6,-32($16)
- or $8,$21,$8
-
- srl $3,$20,$5
- unop # ldq $31,-96($17)
- sll $3,$19,$23
- subq $16,32,$16
-
- srl $4,$20,$6
- ldq $3,-96($17)
- sll $4,$19,$24
- ldq $4,-104($17)
-
- subq $17,32,$17
- bne $18,.Loop
- # cool down phase 2/1
-.Lend2: stq $7,-8($16)
- or $5,$22,$5
- stq $8,-16($16)
- or $6,$23,$6
- srl $1,$20,$7
- sll $1,$19,$21
- srl $2,$20,$8
- sll $2,$19,$22
- stq $5,-24($16)
- or $7,$24,$7
- stq $6,-32($16)
- or $8,$21,$8
- srl $3,$20,$5
- sll $3,$19,$23
- srl $4,$20,$6
- sll $4,$19,$24
- # cool down phase 2/2
- stq $7,-40($16)
- or $5,$22,$5
- stq $8,-48($16)
- or $6,$23,$6
- stq $5,-56($16)
- stq $6,-64($16)
- # cool down phase 2/3
- stq $24,-72($16)
- ret $31,($26),1
-
- # cool down phase 1/1
-.Lend1: srl $1,$20,$7
- sll $1,$19,$21
- srl $2,$20,$8
- sll $2,$19,$22
- srl $3,$20,$5
- or $7,$24,$7
- sll $3,$19,$23
- or $8,$21,$8
- srl $4,$20,$6
- sll $4,$19,$24
- # cool down phase 1/2
- stq $7,-8($16)
- or $5,$22,$5
- stq $8,-16($16)
- or $6,$23,$6
- stq $5,-24($16)
- stq $6,-32($16)
- stq $24,-40($16)
- ret $31,($26),1
-
-.Lend: stq $24,-8($16)
- ret $31,($26),1
- .end __mpn_lshift
diff --git a/mpn/alpha/ev5/rshift.asm b/mpn/alpha/ev5/rshift.asm
new file mode 100644
index 000000000..c3325579f
--- /dev/null
+++ b/mpn/alpha/ev5/rshift.asm
@@ -0,0 +1,167 @@
+dnl Alpha EV5 __mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ subq r31,r19,r20 C r20 = -cnt; shifts use the low 6 count bits, so this acts as 64-cnt
+ subq r18,1,r18 C r18 = size-1
+ and r18,4-1,r28 C number of limbs in first loop
+ sll r4,r20,r0 C compute function result
+
+ beq r28,$L0 C skip first loop when size-1 is a multiple of 4
+ subq r18,r28,r18 C r18 = limbs left for the 4-way unrolled code
+
+ ALIGN(8)
+$Loop0: ldq r3,8(r17) C r3 = next limb
+ addq r16,8,r16
+ srl r4,r19,r5 C r5 = current limb >> cnt
+ addq r17,8,r17
+ subq r28,1,r28
+ sll r3,r20,r6 C r6 = next limb << (64-cnt)
+ bis r3,r3,r4 C r4 = r3, next limb becomes current limb
+ bis r5,r6,r8 C combine both parts into one result limb
+ stq r8,-8(r16) C r16 was already advanced, hence -8
+ bne r28,$Loop0
+
+$L0: srl r4,r19,r24 C r24 = current limb >> cnt, completed by the next limb
+ beq r18,$Lend C no limbs left for the unrolled code
+C warm up phase 1
+ ldq r1,8(r17) C r1..r4 = next four limbs
+ subq r18,4,r18
+ ldq r2,16(r17)
+ ldq r3,24(r17)
+ ldq r4,32(r17)
+ beq r18,$Lend1 C only this one group of four remains
+C warm up phase 2
+ sll r1,r20,r7 C r7 = part of limb r1 that moves into the previous result limb
+ srl r1,r19,r21 C r21 = part of limb r1 that stays in its own result limb
+ sll r2,r20,r8
+ ldq r1,40(r17) C reload r1..r4 with the following group while shifting
+ srl r2,r19,r22
+ ldq r2,48(r17)
+ sll r3,r20,r5
+ bis r7,r24,r7 C r7 = finished result limb
+ srl r3,r19,r23
+ bis r8,r21,r8 C r8 = finished result limb
+ sll r4,r20,r6
+ ldq r3,56(r17)
+ srl r4,r19,r24 C r24 carries over into the next group
+ ldq r4,64(r17)
+ subq r18,4,r18
+ beq r18,$Lend2
+ ALIGN(16)
+C main loop
+$Loop: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+
+ sll r1,r20,r7
+ subq r18,4,r18
+ srl r1,r19,r21
+ unop C ldq r31,-96(r17) -- disabled load; NOTE(review): offset matches lshift, confirm
+
+ sll r2,r20,r8
+ ldq r1,72(r17)
+ srl r2,r19,r22
+ ldq r2,80(r17)
+
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+
+ sll r3,r20,r5
+ unop C ldq r31,-96(r17) -- disabled load; NOTE(review): offset matches lshift, confirm
+ srl r3,r19,r23
+ addq r16,32,r16 C advance res_ptr by four limbs
+
+ sll r4,r20,r6
+ ldq r3,88(r17)
+ srl r4,r19,r24
+ ldq r4,96(r17)
+
+ addq r17,32,r17 C advance s1_ptr by four limbs
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+ sll r3,r20,r5
+ srl r3,r19,r23
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 2/2
+ stq r7,32(r16)
+ bis r5,r22,r5
+ stq r8,40(r16)
+ bis r6,r23,r6
+ stq r5,48(r16)
+ stq r6,56(r16)
+C cool down phase 2/3
+ stq r24,64(r16) C last result limb has no higher limb to merge with
+ ret r31,(r26),1 C return; r0 holds the function result computed at entry
+
+C cool down phase 1/1
+$Lend1: sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 1/2
+ stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ stq r5,16(r16)
+ stq r6,24(r16)
+ stq r24,32(r16) C last result limb has no higher limb to merge with
+ ret r31,(r26),1
+
+$Lend: stq r24,0(r16) C store the single pending result limb
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/mpn/alpha/ev5/rshift.s b/mpn/alpha/ev5/rshift.s
deleted file mode 100644
index 6e24fef96..000000000
--- a/mpn/alpha/ev5/rshift.s
+++ /dev/null
@@ -1,172 +0,0 @@
- # Alpha EV5 __mpn_rshift --
-
- # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # cnt r19
-
- # This code runs at 3.25 cycles/limb on the EV5.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_rshift
- .ent __mpn_rshift
-__mpn_rshift:
- .frame $30,0,$26,0
-
- ldq $4,0($17) # load first limb
- subq $31,$19,$20
- subq $18,1,$18
- and $18,4-1,$28 # number of limbs in first loop
- sll $4,$20,$0 # compute function result
-
- beq $28,.L0
- subq $18,$28,$18
-
- .align 3
-.Loop0: ldq $3,8($17)
- addq $16,8,$16
- srl $4,$19,$5
- addq $17,8,$17
- subq $28,1,$28
- sll $3,$20,$6
- or $3,$3,$4
- or $5,$6,$8
- stq $8,-8($16)
- bne $28,.Loop0
-
-.L0: srl $4,$19,$24
- beq $18,.Lend
- # warm up phase 1
- ldq $1,8($17)
- subq $18,4,$18
- ldq $2,16($17)
- ldq $3,24($17)
- ldq $4,32($17)
- beq $18,.Lend1
- # warm up phase 2
- sll $1,$20,$7
- srl $1,$19,$21
- sll $2,$20,$8
- ldq $1,40($17)
- srl $2,$19,$22
- ldq $2,48($17)
- sll $3,$20,$5
- or $7,$24,$7
- srl $3,$19,$23
- or $8,$21,$8
- sll $4,$20,$6
- ldq $3,56($17)
- srl $4,$19,$24
- ldq $4,64($17)
- subq $18,4,$18
- beq $18,.Lend2
- .align 4
- # main loop
-.Loop: stq $7,0($16)
- or $5,$22,$5
- stq $8,8($16)
- or $6,$23,$6
-
- sll $1,$20,$7
- subq $18,4,$18
- srl $1,$19,$21
- unop # ldq $31,-96($17)
-
- sll $2,$20,$8
- ldq $1,72($17)
- srl $2,$19,$22
- ldq $2,80($17)
-
- stq $5,16($16)
- or $7,$24,$7
- stq $6,24($16)
- or $8,$21,$8
-
- sll $3,$20,$5
- unop # ldq $31,-96($17)
- srl $3,$19,$23
- addq $16,32,$16
-
- sll $4,$20,$6
- ldq $3,88($17)
- srl $4,$19,$24
- ldq $4,96($17)
-
- addq $17,32,$17
- bne $18,.Loop
- # cool down phase 2/1
-.Lend2: stq $7,0($16)
- or $5,$22,$5
- stq $8,8($16)
- or $6,$23,$6
- sll $1,$20,$7
- srl $1,$19,$21
- sll $2,$20,$8
- srl $2,$19,$22
- stq $5,16($16)
- or $7,$24,$7
- stq $6,24($16)
- or $8,$21,$8
- sll $3,$20,$5
- srl $3,$19,$23
- sll $4,$20,$6
- srl $4,$19,$24
- # cool down phase 2/2
- stq $7,32($16)
- or $5,$22,$5
- stq $8,40($16)
- or $6,$23,$6
- stq $5,48($16)
- stq $6,56($16)
- # cool down phase 2/3
- stq $24,64($16)
- ret $31,($26),1
-
- # cool down phase 1/1
-.Lend1: sll $1,$20,$7
- srl $1,$19,$21
- sll $2,$20,$8
- srl $2,$19,$22
- sll $3,$20,$5
- or $7,$24,$7
- srl $3,$19,$23
- or $8,$21,$8
- sll $4,$20,$6
- srl $4,$19,$24
- # cool down phase 1/2
- stq $7,0($16)
- or $5,$22,$5
- stq $8,8($16)
- or $6,$23,$6
- stq $5,16($16)
- stq $6,24($16)
- stq $24,32($16)
- ret $31,($26),1
-
-.Lend: stq $24,0($16)
- ret $31,($26),1
- .end __mpn_rshift
diff --git a/mpn/alpha/ev5/sub_n.asm b/mpn/alpha/ev5/sub_n.asm
new file mode 100644
index 000000000..213c2c885
--- /dev/null
+++ b/mpn/alpha/ev5/sub_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18) C r0..r3 = s2 limbs 0..3
+ ldq r4,0(r17) C r4..r7 = s1 limbs 0..3
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ subq r4,r0,r20 C 1st main subtract
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17) C s1 limb 2, r17 was already advanced
+ cmpult r4,r0,r25 C compute cy from last subtract
+ ldq r7,-8(r17) C s1 limb 3, r17 was already advanced
+ subq r5,r1,r28 C 2nd main subtract
+ addq r18,32,r18 C update s2_ptr
+ subq r28,r25,r21 C 2nd carry subtract
+ cmpult r5,r1,r8 C compute cy from last subtract
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r0,0(r18) C start loading the next group of four
+ bis r8,r25,r25 C combine cy from the two subtracts
+ ldq r1,8(r18)
+ subq r6,r2,r28 C 3rd main subtract
+ ldq r4,0(r17)
+ subq r28,r25,r22 C 3rd carry subtract
+ ldq r5,8(r17)
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16) C store 1st difference
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16) C store 2nd difference
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ subq r4,r0,r28 C 1st main subtract
+ ldq r2,16(r18)
+ subq r28,r25,r20 C 1st carry subtract
+ ldq r3,24(r18)
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r6,-16(r17)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16) C store 3rd difference, r16 was already advanced
+ subq r5,r1,r28 C 2nd main subtract
+ stq r23,-8(r16) C store 4th difference, r16 was already advanced
+ subq r28,r25,r21 C 2nd carry subtract
+ addq r18,32,r18 C update s2_ptr
+ cmpult r5,r1,r8 C compute cy from last subtract
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r6,r2,r28 C 3rd main subtract
+ subq r28,r25,r22 C 3rd carry subtract
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16) C store 1st difference
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16) C store 2nd difference
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16) C store 3rd difference
+ stq r23,-8(r16) C store 4th difference
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret C no limbs left, just return cy
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0 C exactly one limb left
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: subq r4,r0,r28 C main subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r0,8(r18) C load the next limb pair for the following iteration
+ ldq r4,8(r17)
+ subq r28,r25,r20 C carry subtract
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: subq r4,r0,r28 C main subtract
+ subq r28,r25,r20 C carry subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16) C store the last difference
+ bis r8,r25,r25 C combine cy from the two subtracts
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/mpn/alpha/ev5/sub_n.s b/mpn/alpha/ev5/sub_n.s
deleted file mode 100644
index 36994b956..000000000
--- a/mpn/alpha/ev5/sub_n.s
+++ /dev/null
@@ -1,148 +0,0 @@
- # Alpha EV5 __mpn_sub_n -- Subtract two limb vectors of the same length > 0
- # and store difference in a third limb vector.
-
- # Copyright (C) 1995, 1999 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr $16
- # s1_ptr $17
- # s2_ptr $18
- # size $19
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_sub_n
- .ent __mpn_sub_n
-__mpn_sub_n:
- .frame $30,0,$26,0
-
- or $31,$31,$25 # clear cy
- subq $19,4,$19 # decr loop cnt
- blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
- # Start software pipeline for 1st loop
- ldq $0,0($18)
- ldq $4,0($17)
- ldq $1,8($18)
- ldq $5,8($17)
- addq $17,32,$17 # update s1_ptr
- ldq $2,16($18)
- subq $4,$0,$20 # 1st main subtract
- ldq $3,24($18)
- subq $19,4,$19 # decr loop cnt
- ldq $6,-16($17)
- cmpult $4,$0,$25 # compute cy from last subtract
- ldq $7,-8($17)
- subq $5,$1,$28 # 2nd main subtract
- addq $18,32,$18 # update s2_ptr
- subq $28,$25,$21 # 2nd carry subtract
- cmpult $5,$1,$8 # compute cy from last subtract
- blt $19,.Lend1 # if less than 4 limbs remain, jump
- # 1st loop handles groups of 4 limbs in a software pipeline
- .align 4
-.Loop: cmpult $28,$25,$25 # compute cy from last subtract
- ldq $0,0($18)
- or $8,$25,$25 # combine cy from the two subtracts
- ldq $1,8($18)
- subq $6,$2,$28 # 3rd main subtract
- ldq $4,0($17)
- subq $28,$25,$22 # 3rd carry subtract
- ldq $5,8($17)
- cmpult $6,$2,$8 # compute cy from last subtract
- cmpult $28,$25,$25 # compute cy from last subtract
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two subtracts
- stq $21,8($16)
- subq $7,$3,$28 # 4th main subtract
- subq $28,$25,$23 # 4th carry subtract
- cmpult $7,$3,$8 # compute cy from last subtract
- cmpult $28,$25,$25 # compute cy from last subtract
- addq $17,32,$17 # update s1_ptr
- or $8,$25,$25 # combine cy from the two subtracts
- addq $16,32,$16 # update res_ptr
- subq $4,$0,$28 # 1st main subtract
- ldq $2,16($18)
- subq $28,$25,$20 # 1st carry subtract
- ldq $3,24($18)
- cmpult $4,$0,$8 # compute cy from last subtract
- ldq $6,-16($17)
- cmpult $28,$25,$25 # compute cy from last subtract
- ldq $7,-8($17)
- or $8,$25,$25 # combine cy from the two subtracts
- subq $19,4,$19 # decr loop cnt
- stq $22,-16($16)
- subq $5,$1,$28 # 2nd main subtract
- stq $23,-8($16)
- subq $28,$25,$21 # 2nd carry subtract
- addq $18,32,$18 # update s2_ptr
- cmpult $5,$1,$8 # compute cy from last subtract
- bge $19,.Loop
- # Finish software pipeline for 1st loop
-.Lend1: cmpult $28,$25,$25 # compute cy from last subtract
- or $8,$25,$25 # combine cy from the two subtracts
- subq $6,$2,$28 # cy add
- subq $28,$25,$22 # 3rd main subtract
- cmpult $6,$2,$8 # compute cy from last subtract
- cmpult $28,$25,$25 # compute cy from last subtract
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two subtracts
- stq $21,8($16)
- subq $7,$3,$28 # cy add
- subq $28,$25,$23 # 4th main subtract
- cmpult $7,$3,$8 # compute cy from last subtract
- cmpult $28,$25,$25 # compute cy from last subtract
- or $8,$25,$25 # combine cy from the two subtracts
- addq $16,32,$16 # update res_ptr
- stq $22,-16($16)
- stq $23,-8($16)
-.Lend2: addq $19,4,$19 # restore loop cnt
- beq $19,.Lret
- # Start software pipeline for 2nd loop
- ldq $0,0($18)
- ldq $4,0($17)
- subq $19,1,$19
- beq $19,.Lend0
- # 2nd loop handles remaining 1-3 limbs
- .align 4
-.Loop0: subq $4,$0,$28 # main subtract
- cmpult $4,$0,$8 # compute cy from last subtract
- ldq $0,8($18)
- ldq $4,8($17)
- subq $28,$25,$20 # carry subtract
- addq $18,8,$18
- addq $17,8,$17
- stq $20,0($16)
- cmpult $28,$25,$25 # compute cy from last subtract
- subq $19,1,$19 # decr loop cnt
- or $8,$25,$25 # combine cy from the two subtracts
- addq $16,8,$16
- bne $19,.Loop0
-.Lend0: subq $4,$0,$28 # main subtract
- subq $28,$25,$20 # carry subtract
- cmpult $4,$0,$8 # compute cy from last subtract
- cmpult $28,$25,$25 # compute cy from last subtract
- stq $20,0($16)
- or $8,$25,$25 # combine cy from the two subtracts
-
-.Lret: or $25,$31,$0 # return cy
- ret $31,($26),1
- .end __mpn_sub_n
diff --git a/mpn/alpha/invert-limb.s b/mpn/alpha/invert_limb.asm
index 9706f4b76..9e5cb22a1 100644
--- a/mpn/alpha/invert-limb.s
+++ b/mpn/alpha/invert_limb.asm
@@ -1,101 +1,90 @@
- # Alpha mpn_invert_normalized_limb -- Invert a normalized limb.
+dnl Alpha mpn_invert_normalized_limb -- Invert a normalized limb.
- # Copyright (C) 1996 Free Software Foundation, Inc.
+dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc.
- # This file is part of the GNU MP Library.
+dnl This file is part of the GNU MP Library.
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
- #
- # This is based on sophie:/gmp-stuff/dbg-inv-limb.c.
- # The ideas are due to Peter L. Montgomery
- #
- # The table below uses 4096 bytes. The file mentioned above has an
- # alternative function that doesn't require the table, but it runs 50%
- # slower than this.
+dnl
+dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c.
+dnl The ideas are due to Peter L. Montgomery
+dnl
+dnl The table below uses 4096 bytes. The file mentioned above has an
+dnl alternative function that doesn't require the table, but it runs 50%
+dnl slower than this.
- .set noreorder
- .set volatile
- .set noat
-.text
- .align 3
-$C36:
- .t_floating 9223372036854775808.0
-.text
- .align 3
- .globl __mpn_invert_normalized_limb
- .ent __mpn_invert_normalized_limb
-__mpn_invert_normalized_limb:
- ldgp $29,0($27)
-__mpn_invert_normalized_limb..ng:
- lda $30,-16($30)
- .frame $30,16,$26,0
- .prologue 1
- addq $16,$16,$1
- bne $1,$73
- lda $0,-1
- br $31,.Lend
+include(`../config.m4')
+
+ASM_START()
+
+INT64($C36,X(43e0000000000000)) C 2^63
+
+PROLOGUE_GP(mpn_invert_normalized_limb)
+ lda r30,-16(r30) C reserve 16 bytes of stack as integer/FP transfer scratch
+ addq r16,r16,r1 C r1 = 2*d, zero when d has no bits set below bit 63
+ bne r1,$73
+ lda r0,-1 C special case: return all ones
+ br r31,$Lend
$73:
- srl $16,1,$1
- stq $1,0($30)
- ldt $f11,0($30)
- cvtqt $f11,$f1
- lda $1,$C36
- ldt $f10,0($1)
- divt $f10,$f1,$f10
- lda $2,invtab-4096
- srl $16,52,$1
- addq $1,$1,$1
- addq $1,$2,$1
- bic $1,6,$2
- ldq $2,0($2)
- bic $1,1,$1
- extwl $2,$1,$2
- sll $2,48,$0
- umulh $16,$0,$1
- addq $16,$1,$3
- stq $3,0($30)
- ldt $f11,0($30)
- cvtqt $f11,$f1
- mult $f1,$f10,$f1
- cvttqc $f1,$f1
- stt $f1,0($30)
- ldq $4,0($30)
- subq $0,$4,$0
- umulh $16,$0,$1
- mulq $16,$0,$2
- addq $16,$1,$3
- bge $3,.Loop2
-.Loop1: addq $2,$16,$2
- cmpult $2,$16,$1
- addq $3,$1,$3
- addq $0,1,$0
- blt $3,.Loop1
-.Loop2: cmpult $2,$16,$1
- subq $0,1,$0
- subq $3,$1,$3
- subq $2,$16,$2
- bge $3,.Loop2
-.Lend:
- addq $30,16,$30
- ret $31,($26),1
- .end __mpn_invert_normalized_limb
-.text
- .align 1
-invtab:
+ srl r16,1,r1 C r1 = d >> 1
+ stq r1,0(r30) C move the integer to the FP unit through the stack slot
+ ldt f11,0(r30)
+ cvtqt f11,f1 C f1 = d >> 1 as floating point
+ lda r1,$C36
+ ldt f10,0(r1) C f10 = constant at $C36
+ divt f10,f1,f10 C f10 = constant / (d >> 1)
+ lda r2,$invtab-4096 C biased table base; presumably d is normalized so the offset below is >= 4096
+ srl r16,52,r1 C r1 = top 12 bits of d
+ addq r1,r1,r1 C times 2, byte offset of a 16-bit table entry
+ addq r1,r2,r1 C r1 = address of the table entry
+ bic r1,6,r2 C clear address bits 1 and 2 for the quadword load
+ ldq r2,0(r2) C load the quadword holding the entry
+ bic r1,1,r1
+ extwl r2,r1,r2 C extract the 16-bit entry selected by the low address bits
+ sll r2,48,r0 C r0 = table entry moved to the top 16 bits, first approximation
+ umulh r16,r0,r1 C r1 = high 64 bits of d * r0
+ addq r16,r1,r3 C r3 = d + r1
+ stq r3,0(r30) C move r3 to the FP unit through the stack slot
+ ldt f11,0(r30)
+ cvtqt f11,f1
+ mult f1,f10,f1 C scale by the quotient computed above
+ cvttq/c f1,f1 C back to integer, chopped rounding
+ stt f1,0(r30)
+ ldq r4,0(r30) C r4 = correction term
+ subq r0,r4,r0 C refine the approximation
+ umulh r16,r0,r1 C r1 = high 64 bits of d * r0
+ mulq r16,r0,r2 C r2 = low 64 bits of d * r0
+ addq r16,r1,r3 C r3 = d + r1, high word checked by the loops below
+ bge r3,$Loop2
+$Loop1: addq r2,r16,r2 C while r3 < 0: add d to the low word
+ cmpult r2,r16,r1 C r1 = carry out of that add
+ addq r3,r1,r3 C propagate carry into the high word
+ addq r0,1,r0 C and increment the result
+ blt r3,$Loop1
+$Loop2: cmpult r2,r16,r1 C r1 = borrow from subtracting d from the low word
+ subq r0,1,r0 C decrement the result
+ subq r3,r1,r3 C propagate borrow into the high word
+ subq r2,r16,r2
+ bge r3,$Loop2 C repeat while r3 >= 0
+$Lend:
+ lda r30,16(r30) C release the stack scratch
+ ret r31,(r26),1 C return with the result in r0
+EPILOGUE(mpn_invert_normalized_limb)
+DATASTART(`$invtab',4)
.word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41
.word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46
.word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50
@@ -352,3 +341,5 @@ invtab:
.word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111
.word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090
.word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010
+DATAEND()
+ASM_END()
diff --git a/mpn/alpha/lshift.asm b/mpn/alpha/lshift.asm
new file mode 100644
index 000000000..de0ce473c
--- /dev/null
+++ b/mpn/alpha/lshift.asm
@@ -0,0 +1,104 @@
+dnl Alpha mpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl make it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb, the one at the highest address
+ subq r17,8,r17 C r17 now points at the limb just loaded
+ subq r31,r19,r7 C r7 = -cnt; shifts use the low 6 count bits, so this acts as 64-cnt
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18 C r18 = size-1
+ and r18,4-1,r20 C number of limbs in first loop
+ srl r4,r7,r0 C compute function result
+
+ beq r20,$L0 C skip first loop when size-1 is a multiple of 4
+ subq r18,r20,r18 C r18 = limbs left for the 4-way unrolled loop
+
+ ALIGN(8)
+$Loop0:
+ ldq r3,-8(r17) C r3 = next lower limb
+ subq r16,8,r16
+ subq r17,8,r17
+ subq r20,1,r20
+ sll r4,r19,r5 C r5 = current limb << cnt
+ srl r3,r7,r6 C r6 = next lower limb >> (64-cnt)
+ bis r3,r3,r4 C r4 = r3, next limb becomes current limb
+ bis r5,r6,r8 C combine both parts into one result limb
+ stq r8,0(r16)
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend C no limbs left for the unrolled loop
+
+ ALIGN(8)
+$Loop: ldq r3,-8(r17)
+ subq r16,32,r16 C step res_ptr down four limbs, stores below use +24..+0
+ subq r18,4,r18
+ sll r4,r19,r5
+ srl r3,r7,r6
+
+ ldq r4,-16(r17)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,24(r16)
+ srl r4,r7,r2
+
+ ldq r3,-24(r17)
+ sll r4,r19,r5
+ bis r1,r2,r8
+ stq r8,16(r16)
+ srl r3,r7,r6
+
+ ldq r4,-32(r17) C r4 stays live as the current limb for the next pass or for $Lend
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,8(r16)
+ srl r4,r7,r2
+
+ subq r17,32,r17 C step s1_ptr down four limbs
+ bis r1,r2,r8
+ stq r8,0(r16)
+
+ bgt r18,$Loop
+
+$Lend: sll r4,r19,r8 C last limb has no lower limb to merge with
+ stq r8,-8(r16)
+ ret r31,(r26),1 C return; r0 holds the function result computed at entry
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/mpn/alpha/lshift.s b/mpn/alpha/lshift.s
deleted file mode 100644
index 6a3e55a93..000000000
--- a/mpn/alpha/lshift.s
+++ /dev/null
@@ -1,109 +0,0 @@
- # Alpha __mpn_lshift --
-
- # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # cnt r19
-
- # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
- # it would take 4 cycles/limb. It should be possible to get down to 3
- # cycles/limb since both ldq and stq can be paired with the other used
- # instructions. But there are many restrictions in the 21064 pipeline that
- # makes it hard, if not impossible, to get down to 3 cycles/limb:
-
- # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
- # 2. Only aligned instruction pairs can be paired.
- # 3. The store buffer or silo might not be able to deal with the bandwidth.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_lshift
- .ent __mpn_lshift
-__mpn_lshift:
- .frame $30,0,$26,0
-
- s8addq $18,$17,$17 # make r17 point at end of s1
- ldq $4,-8($17) # load first limb
- subq $17,8,$17
- subq $31,$19,$7
- s8addq $18,$16,$16 # make r16 point at end of RES
- subq $18,1,$18
- and $18,4-1,$20 # number of limbs in first loop
- srl $4,$7,$0 # compute function result
-
- beq $20,.L0
- subq $18,$20,$18
-
- .align 3
-.Loop0:
- ldq $3,-8($17)
- subq $16,8,$16
- subq $17,8,$17
- subq $20,1,$20
- sll $4,$19,$5
- srl $3,$7,$6
- bis $3,$3,$4
- bis $5,$6,$8
- stq $8,0($16)
- bne $20,.Loop0
-
-.L0: beq $18,.Lend
-
- .align 3
-.Loop: ldq $3,-8($17)
- subq $16,32,$16
- subq $18,4,$18
- sll $4,$19,$5
- srl $3,$7,$6
-
- ldq $4,-16($17)
- sll $3,$19,$1
- bis $5,$6,$8
- stq $8,24($16)
- srl $4,$7,$2
-
- ldq $3,-24($17)
- sll $4,$19,$5
- bis $1,$2,$8
- stq $8,16($16)
- srl $3,$7,$6
-
- ldq $4,-32($17)
- sll $3,$19,$1
- bis $5,$6,$8
- stq $8,8($16)
- srl $4,$7,$2
-
- subq $17,32,$17
- bis $1,$2,$8
- stq $8,0($16)
-
- bgt $18,.Loop
-
-.Lend: sll $4,$19,$8
- stq $8,-8($16)
- ret $31,($26),1
- .end __mpn_lshift
diff --git a/mpn/alpha/mul_1.asm b/mpn/alpha/mul_1.asm
new file mode 100644
index 000000000..94cd55c9c
--- /dev/null
+++ b/mpn/alpha/mul_1.asm
@@ -0,0 +1,71 @@
+dnl Alpha mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ bic r31,r31,r4 C clear cy_limb
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,8(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ stq r3,0(r16) C store result limb 0; res_ptr is not advanced yet
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,16(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,8(r16) C offset 8 because res_ptr is bumped two lines below
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r16,8,r16 C res_ptr++
+ bne r18,$Loop C repeat until only the last limb is pending
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,8(r16) C store the final result limb
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1 C return with cy_limb in r0
+$Lend1: stq r3,0(r16) C size was 1: store the single low product
+ ret r31,(r26),1 C return with prod_high in r0
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/mpn/alpha/mul_1.s b/mpn/alpha/mul_1.s
deleted file mode 100644
index 470c89368..000000000
--- a/mpn/alpha/mul_1.s
+++ /dev/null
@@ -1,85 +0,0 @@
- # Alpha __mpn_mul_1 -- Multiply a limb vector with a limb and store
- # the result in a second limb vector.
-
- # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # s2_limb r19
-
- # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
-
- # To improve performance for long multiplications, we would use
- # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
- # these instructions without slowing down the general code: 1. We can
- # only have two prefetches in operation at any time in the Alpha
- # architecture. 2. There will seldom be any special alignment
- # between RES_PTR and S1_PTR. Maybe we can simply divide the current
- # loop into an inner and outer loop, having the inner loop handle
- # exactly one prefetch block?
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_mul_1
- .ent __mpn_mul_1 2
-__mpn_mul_1:
- .frame $30,0,$26
-
- ldq $2,0($17) # $2 = s1_limb
- subq $18,1,$18 # size--
- mulq $2,$19,$3 # $3 = prod_low
- bic $31,$31,$4 # clear cy_limb
- umulh $2,$19,$0 # $0 = prod_high
- beq $18,.Lend1 # jump if size was == 1
- ldq $2,8($17) # $2 = s1_limb
- subq $18,1,$18 # size--
- stq $3,0($16)
- beq $18,.Lend2 # jump if size was == 2
-
- .align 3
-.Loop: mulq $2,$19,$3 # $3 = prod_low
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- subq $18,1,$18 # size--
- umulh $2,$19,$4 # $4 = cy_limb
- ldq $2,16($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- stq $3,8($16)
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- addq $16,8,$16 # res_ptr++
- bne $18,.Loop
-
-.Lend2: mulq $2,$19,$3 # $3 = prod_low
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- umulh $2,$19,$4 # $4 = cy_limb
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- stq $3,8($16)
- addq $4,$0,$0 # cy_limb = prod_high + cy
- ret $31,($26),1
-.Lend1: stq $3,0($16)
- ret $31,($26),1
-
- .end __mpn_mul_1
diff --git a/mpn/alpha/rshift.asm b/mpn/alpha/rshift.asm
new file mode 100644
index 000000000..4c111d237
--- /dev/null
+++ b/mpn/alpha/rshift.asm
@@ -0,0 +1,102 @@
+dnl Alpha mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl make it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ addq r17,8,r17 C s1_ptr now points at the second limb
+ subq r31,r19,r7 C r7 = -cnt which acts as 64-cnt since only the low 6 count bits are used
+ subq r18,1,r18 C size-1 result limbs are formed from limb pairs
+ and r18,4-1,r20 C number of limbs in first loop
+ sll r4,r7,r0 C compute function result
+
+ beq r20,$L0 C no leftover limbs: go straight to the unrolled loop
+ subq r18,r20,r18 C r18 = count left for the unrolled loop
+
+ ALIGN(8)
+$Loop0:
+ ldq r3,0(r17) C r3 = limb at the next higher address
+ addq r16,8,r16 C res_ptr++ (store below compensates with -8)
+ addq r17,8,r17 C s1_ptr++
+ subq r20,1,r20 C one leftover limb done
+ srl r4,r19,r5 C r5 = current limb moved right by cnt
+ sll r3,r7,r6 C r6 = bits entering from the next limb
+ bis r3,r3,r4 C next limb becomes the current limb
+ bis r5,r6,r8 C merge both parts
+ stq r8,-8(r16) C store result limb
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend C no groups of four pending: finish
+
+ ALIGN(8)
+$Loop: ldq r3,0(r17) C group limb 1 (current limb is in r4)
+ addq r16,32,r16 C bump res_ptr early; stores use negative offsets
+ subq r18,4,r18 C four limbs per pass
+ srl r4,r19,r5 C low part of result 0
+ sll r3,r7,r6 C high part of result 0
+
+ ldq r4,8(r17) C group limb 2
+ srl r3,r19,r1 C low part of result 1
+ bis r5,r6,r8 C merge result 0
+ stq r8,-32(r16) C store result 0
+ sll r4,r7,r2 C high part of result 1
+
+ ldq r3,16(r17) C group limb 3
+ srl r4,r19,r5 C low part of result 2
+ bis r1,r2,r8 C merge result 1
+ stq r8,-24(r16) C store result 1
+ sll r3,r7,r6 C high part of result 2
+
+ ldq r4,24(r17) C group limb 4; stays in r4 as the current limb
+ srl r3,r19,r1 C low part of result 3
+ bis r5,r6,r8 C merge result 2
+ stq r8,-16(r16) C store result 2
+ sll r4,r7,r2 C high part of result 3
+
+ addq r17,32,r17 C s1_ptr += 4 limbs
+ bis r1,r2,r8 C merge result 3
+ stq r8,-8(r16) C store result 3
+
+ bgt r18,$Loop
+
+$Lend: srl r4,r19,r8 C last limb: nothing above it so only the right-moved part
+ stq r8,0(r16) C store the most significant result limb
+ ret r31,(r26),1 C return; r0 holds the bits moved out of the first limb
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/mpn/alpha/rshift.s b/mpn/alpha/rshift.s
deleted file mode 100644
index 12a3e369d..000000000
--- a/mpn/alpha/rshift.s
+++ /dev/null
@@ -1,107 +0,0 @@
- # Alpha __mpn_rshift --
-
- # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # cnt r19
-
- # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
- # it would take 4 cycles/limb. It should be possible to get down to 3
- # cycles/limb since both ldq and stq can be paired with the other used
- # instructions. But there are many restrictions in the 21064 pipeline that
- # makes it hard, if not impossible, to get down to 3 cycles/limb:
-
- # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
- # 2. Only aligned instruction pairs can be paired.
- # 3. The store buffer or silo might not be able to deal with the bandwidth.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_rshift
- .ent __mpn_rshift
-__mpn_rshift:
- .frame $30,0,$26,0
-
- ldq $4,0($17) # load first limb
- addq $17,8,$17
- subq $31,$19,$7
- subq $18,1,$18
- and $18,4-1,$20 # number of limbs in first loop
- sll $4,$7,$0 # compute function result
-
- beq $20,.L0
- subq $18,$20,$18
-
- .align 3
-.Loop0:
- ldq $3,0($17)
- addq $16,8,$16
- addq $17,8,$17
- subq $20,1,$20
- srl $4,$19,$5
- sll $3,$7,$6
- bis $3,$3,$4
- bis $5,$6,$8
- stq $8,-8($16)
- bne $20,.Loop0
-
-.L0: beq $18,.Lend
-
- .align 3
-.Loop: ldq $3,0($17)
- addq $16,32,$16
- subq $18,4,$18
- srl $4,$19,$5
- sll $3,$7,$6
-
- ldq $4,8($17)
- srl $3,$19,$1
- bis $5,$6,$8
- stq $8,-32($16)
- sll $4,$7,$2
-
- ldq $3,16($17)
- srl $4,$19,$5
- bis $1,$2,$8
- stq $8,-24($16)
- sll $3,$7,$6
-
- ldq $4,24($17)
- srl $3,$19,$1
- bis $5,$6,$8
- stq $8,-16($16)
- sll $4,$7,$2
-
- addq $17,32,$17
- bis $1,$2,$8
- stq $8,-8($16)
-
- bgt $18,.Loop
-
-.Lend: srl $4,$19,$8
- stq $8,0($16)
- ret $31,($26),1
- .end __mpn_rshift
diff --git a/mpn/alpha/sub_n.asm b/mpn/alpha/sub_n.asm
new file mode 100644
index 000000000..e227af553
--- /dev/null
+++ b/mpn/alpha/sub_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ ldq r3,0(r17) C r3 = first s1 limb
+ ldq r4,0(r18) C r4 = first s2 limb
+
+ subq r19,1,r19 C the final limb is handled after the loops
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0 C clear borrow
+ beq r2,$L0 C if size-1 is a multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19 C r19 = count left for the unrolled loop
+
+$Loop0: subq r2,1,r2
+ ldq r5,8(r17) C preload next s1 limb
+ addq r4,r0,r4 C s2 limb + borrow in
+ ldq r6,8(r18) C preload next s2 limb
+ cmpult r4,r0,r1 C r1 = carry out of that addition
+ subq r3,r4,r4 C r4 = difference limb
+ cmpult r3,r4,r0 C r0 = borrow out of the subtraction
+ stq r4,0(r16) C store difference limb
+ bis r0,r1,r0 C borrow out = either of the two conditions
+
+ addq r17,8,r17 C s1_ptr++
+ addq r18,8,r18 C s2_ptr++
+ bis r5,r5,r3 C preloaded s1 limb becomes current
+ bis r6,r6,r4 C preloaded s2 limb becomes current
+ addq r16,8,r16 C res_ptr++
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend C no groups of four pending: do the final limb
+
+ ALIGN(8)
+$Loop: subq r19,4,r19 C four limbs per pass
+
+ ldq r5,8(r17) C preload s1 limb 1
+ addq r4,r0,r4 C limb 0: s2 limb + borrow in
+ ldq r6,8(r18) C preload s2 limb 1
+ cmpult r4,r0,r1 C carry out of the addition
+ subq r3,r4,r4 C difference limb 0
+ cmpult r3,r4,r0 C borrow out of the subtraction
+ stq r4,0(r16)
+ bis r0,r1,r0 C combined borrow out
+
+ ldq r3,16(r17) C preload s1 limb 2
+ addq r6,r0,r6 C limb 1: operands are in r5 and r6
+ ldq r4,16(r18) C preload s2 limb 2
+ cmpult r6,r0,r1
+ subq r5,r6,r6 C difference limb 1
+ cmpult r5,r6,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17) C preload s1 limb 3
+ addq r4,r0,r4 C limb 2: operands are in r3 and r4
+ ldq r6,24(r18) C preload s2 limb 3
+ cmpult r4,r0,r1
+ subq r3,r4,r4 C difference limb 2
+ cmpult r3,r4,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17) C preload first s1 limb for the next pass
+ addq r6,r0,r6 C limb 3: operands are in r5 and r6
+ ldq r4,32(r18) C preload first s2 limb for the next pass
+ cmpult r6,r0,r1
+ subq r5,r6,r6 C difference limb 3
+ cmpult r5,r6,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17 C s1_ptr += 4 limbs
+ addq r18,32,r18 C s2_ptr += 4 limbs
+ addq r16,32,r16 C res_ptr += 4 limbs
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4 C final limb: s2 limb + borrow in
+ cmpult r4,r0,r1 C carry out of the addition
+ subq r3,r4,r4 C final difference limb
+ cmpult r3,r4,r0 C borrow out of the subtraction
+ stq r4,0(r16)
+ bis r0,r1,r0 C r0 = borrow out of the whole subtraction
+ ret r31,(r26),1 C return with the borrow in r0
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/mpn/alpha/sub_n.s b/mpn/alpha/sub_n.s
deleted file mode 100644
index 3c90c1169..000000000
--- a/mpn/alpha/sub_n.s
+++ /dev/null
@@ -1,120 +0,0 @@
- # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
- # store difference in a third limb vector.
-
- # Copyright (C) 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr $16
- # s1_ptr $17
- # s2_ptr $18
- # size $19
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_sub_n
- .ent __mpn_sub_n
-__mpn_sub_n:
- .frame $30,0,$26,0
-
- ldq $3,0($17)
- ldq $4,0($18)
-
- subq $19,1,$19
- and $19,4-1,$2 # number of limbs in first loop
- bis $31,$31,$0
- beq $2,.L0 # if multiple of 4 limbs, skip first loop
-
- subq $19,$2,$19
-
-.Loop0: subq $2,1,$2
- ldq $5,8($17)
- addq $4,$0,$4
- ldq $6,8($18)
- cmpult $4,$0,$1
- subq $3,$4,$4
- cmpult $3,$4,$0
- stq $4,0($16)
- or $0,$1,$0
-
- addq $17,8,$17
- addq $18,8,$18
- bis $5,$5,$3
- bis $6,$6,$4
- addq $16,8,$16
- bne $2,.Loop0
-
-.L0: beq $19,.Lend
-
- .align 3
-.Loop: subq $19,4,$19
-
- ldq $5,8($17)
- addq $4,$0,$4
- ldq $6,8($18)
- cmpult $4,$0,$1
- subq $3,$4,$4
- cmpult $3,$4,$0
- stq $4,0($16)
- or $0,$1,$0
-
- ldq $3,16($17)
- addq $6,$0,$6
- ldq $4,16($18)
- cmpult $6,$0,$1
- subq $5,$6,$6
- cmpult $5,$6,$0
- stq $6,8($16)
- or $0,$1,$0
-
- ldq $5,24($17)
- addq $4,$0,$4
- ldq $6,24($18)
- cmpult $4,$0,$1
- subq $3,$4,$4
- cmpult $3,$4,$0
- stq $4,16($16)
- or $0,$1,$0
-
- ldq $3,32($17)
- addq $6,$0,$6
- ldq $4,32($18)
- cmpult $6,$0,$1
- subq $5,$6,$6
- cmpult $5,$6,$0
- stq $6,24($16)
- or $0,$1,$0
-
- addq $17,32,$17
- addq $18,32,$18
- addq $16,32,$16
- bne $19,.Loop
-
-.Lend: addq $4,$0,$4
- cmpult $4,$0,$1
- subq $3,$4,$4
- cmpult $3,$4,$0
- stq $4,0($16)
- or $0,$1,$0
- ret $31,($26),1
-
- .end __mpn_sub_n
diff --git a/mpn/alpha/submul_1.asm b/mpn/alpha/submul_1.asm
new file mode 100644
index 000000000..5122d9e80
--- /dev/null
+++ b/mpn/alpha/submul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Library General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Library General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ subq r5,r3,r3 C r3 = *res_ptr - prod_low
+ cmpult r5,r3,r4 C r4 = borrow from that subtraction
+ stq r3,0(r16) C store result limb 0
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3 C r3 = *res_ptr - (cy_limb + prod_low)
+ cmpult r5,r3,r5 C r5 = borrow from that subtraction
+ stq r3,0(r16) C store result limb
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3 C r3 = *res_ptr - (cy_limb + prod_low)
+ cmpult r5,r3,r5 C r5 = borrow from that subtraction
+ stq r3,0(r16) C store the final result limb
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1 C return with cy_limb in r0
+$Lend1: subq r5,r3,r3 C size was 1: r3 = *res_ptr - prod_low
+ cmpult r5,r3,r5 C r5 = borrow from that subtraction
+ stq r3,0(r16) C store the single result limb
+ addq r0,r5,r0 C r0 = prod_high + borrow
+ ret r31,(r26),1 C return with cy_limb in r0
+EPILOGUE(mpn_submul_1)
+ASM_END()
diff --git a/mpn/alpha/submul_1.s b/mpn/alpha/submul_1.s
deleted file mode 100644
index 319c10f07..000000000
--- a/mpn/alpha/submul_1.s
+++ /dev/null
@@ -1,92 +0,0 @@
- # Alpha __mpn_submul_1 -- Multiply a limb vector with a limb and
- # subtract the result from a second limb vector.
-
- # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
-
- # This file is part of the GNU MP Library.
-
- # The GNU MP Library is free software; you can redistribute it and/or modify
- # it under the terms of the GNU Library General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or (at your
- # option) any later version.
-
- # The GNU MP Library is distributed in the hope that it will be useful, but
- # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
- # License for more details.
-
- # You should have received a copy of the GNU Library General Public License
- # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
- # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- # MA 02111-1307, USA.
-
-
- # INPUT PARAMETERS
- # res_ptr r16
- # s1_ptr r17
- # size r18
- # s2_limb r19
-
- # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
-
- .set noreorder
- .set noat
-.text
- .align 3
- .globl __mpn_submul_1
- .ent __mpn_submul_1 2
-__mpn_submul_1:
- .frame $30,0,$26
-
- ldq $2,0($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- subq $18,1,$18 # size--
- mulq $2,$19,$3 # $3 = prod_low
- ldq $5,0($16) # $5 = *res_ptr
- umulh $2,$19,$0 # $0 = prod_high
- beq $18,.Lend1 # jump if size was == 1
- ldq $2,0($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- subq $18,1,$18 # size--
- subq $5,$3,$3
- cmpult $5,$3,$4
- stq $3,0($16)
- addq $16,8,$16 # res_ptr++
- beq $18,.Lend2 # jump if size was == 2
-
- .align 3
-.Loop: mulq $2,$19,$3 # $3 = prod_low
- ldq $5,0($16) # $5 = *res_ptr
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- subq $18,1,$18 # size--
- umulh $2,$19,$4 # $4 = cy_limb
- ldq $2,0($17) # $2 = s1_limb
- addq $17,8,$17 # s1_ptr++
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- subq $5,$3,$3
- cmpult $5,$3,$5
- stq $3,0($16)
- addq $16,8,$16 # res_ptr++
- addq $5,$0,$0 # combine carries
- bne $18,.Loop
-
-.Lend2: mulq $2,$19,$3 # $3 = prod_low
- ldq $5,0($16) # $5 = *res_ptr
- addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
- umulh $2,$19,$4 # $4 = cy_limb
- addq $3,$0,$3 # $3 = cy_limb + prod_low
- cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
- subq $5,$3,$3
- cmpult $5,$3,$5
- stq $3,0($16)
- addq $5,$0,$0 # combine carries
- addq $4,$0,$0 # cy_limb = prod_high + cy
- ret $31,($26),1
-.Lend1: subq $5,$3,$3
- cmpult $5,$3,$5
- stq $3,0($16)
- addq $0,$5,$0
- ret $31,($26),1
-
- .end __mpn_submul_1
diff --git a/mpn/alpha/unicos.m4 b/mpn/alpha/unicos.m4
new file mode 100644
index 000000000..93d97b7e1
--- /dev/null
+++ b/mpn/alpha/unicos.m4
@@ -0,0 +1,41 @@
+divert(-1)
+
+define(`ASM_START',
+ `.ident dummy') ; opens an assembly file with an ident directive
+
+define(`X',`^X$1') ; prepends a caret-letter marker to its argument; presumably the Cray hex radix -- TODO confirm
+define(`INT64',
+ `dnl
+ .psect $1@crud,data
+$1: .quad $2
+ .endp') ; one labelled quad placed in a data psect of its own
+
+define(`PROLOGUE',
+ `dnl
+ .stack 192 ; What does this mean? Only Cray knows.
+ .psect $1@code,code,cache
+$1::') ; function entry: stack directive then a code psect and the entry label
+define(`PROLOGUE_GP', `PROLOGUE($1)') ; same expansion as the plain entry macro on this target
+
+define(`EPILOGUE',
+ `dnl
+ .endp') ; closes the psect opened at function entry
+
+define(`DATASTART',
+ `dnl
+ .psect $1@crud,data
+$1:') ; opens a data psect and emits its label
+define(`DATAEND',
+ `dnl
+ .endp') ; closes that data psect
+
+define(`ASM_END',
+ `dnl
+ .end') ; emits the end directive that terminates the assembly source
+
+define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop
+
+define(`ALIGN',`') ; Unicos assembler seems to align using garbage
+
+divert
+