summaryrefslogtreecommitdiff
path: root/rts/gmp/mpn/alpha
diff options
context:
space:
mode:
Diffstat (limited to 'rts/gmp/mpn/alpha')
-rw-r--r--rts/gmp/mpn/alpha/README224
-rw-r--r--rts/gmp/mpn/alpha/add_n.asm114
-rw-r--r--rts/gmp/mpn/alpha/addmul_1.asm87
-rw-r--r--rts/gmp/mpn/alpha/cntlz.asm68
-rw-r--r--rts/gmp/mpn/alpha/default.m477
-rw-r--r--rts/gmp/mpn/alpha/ev5/add_n.asm143
-rw-r--r--rts/gmp/mpn/alpha/ev5/lshift.asm169
-rw-r--r--rts/gmp/mpn/alpha/ev5/rshift.asm167
-rw-r--r--rts/gmp/mpn/alpha/ev5/sub_n.asm143
-rw-r--r--rts/gmp/mpn/alpha/ev6/addmul_1.asm474
-rw-r--r--rts/gmp/mpn/alpha/ev6/gmp-mparam.h62
-rw-r--r--rts/gmp/mpn/alpha/gmp-mparam.h64
-rw-r--r--rts/gmp/mpn/alpha/invert_limb.asm345
-rw-r--r--rts/gmp/mpn/alpha/lshift.asm104
-rw-r--r--rts/gmp/mpn/alpha/mul_1.asm71
-rw-r--r--rts/gmp/mpn/alpha/rshift.asm102
-rw-r--r--rts/gmp/mpn/alpha/sub_n.asm114
-rw-r--r--rts/gmp/mpn/alpha/submul_1.asm87
-rw-r--r--rts/gmp/mpn/alpha/udiv_qrnnd.S151
-rw-r--r--rts/gmp/mpn/alpha/umul.asm39
-rw-r--r--rts/gmp/mpn/alpha/unicos.m463
21 files changed, 2868 insertions, 0 deletions
diff --git a/rts/gmp/mpn/alpha/README b/rts/gmp/mpn/alpha/README
new file mode 100644
index 0000000000..744260c7c5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/README
@@ -0,0 +1,224 @@
+This directory contains mpn functions optimized for DEC Alpha processors.
+
+ALPHA ASSEMBLY RULES AND REGULATIONS
+
+The `.prologue N' pseudo op marks the end of the instructions that need
+special handling by unwinding.  It also says whether $27 is really
+needed for computing the gp. The `.mask M' pseudo op says which
+registers are saved on the stack, and at what offset in the frame.
+
+Cray code is very very different...
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+EV4
+
+1. This chip has very limited store bandwidth. The on-chip L1 cache is
+   write-through, and a cache line is transferred from the store buffer to
+   the off-chip L2 in as many as 15 cycles on most systems.  This delay hurts
+ mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift.
+
+2. Pairing is possible between memory instructions and integer arithmetic
+ instructions.
+
+3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of
+ these cycles are pipelined. Thus, multiply instructions can be issued at
+ a rate of one each 21st cycle.
+
+EV5
+
+1. The memory bandwidth of this chip seems excellent, both for loads and
+ stores. Even when the working set is larger than the on-chip L1 and L2
+   caches, the performance remains almost unaffected.
+
+2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle.
+ umulh has a measured latency of 14 cycles and an issue rate of 1 each
+ 10th cycle. But the exact timing is somewhat confusing.
+
+3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12
+ are memory operations. This will take at least
+ ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles
+ We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data
+ cache cycles, which should be completely hidden in the 19 issue cycles.
+ The computation is inherently serial, with these dependencies:
+
+ ldq ldq
+ \ /\
+ (or) addq |
+ |\ / \ |
+ | addq cmpult
+ \ | |
+ cmpult |
+ \ /
+ or
+
+ I.e., 3 operations are needed between carry-in and carry-out, making 12
+ cycles the absolute minimum for the 4 limbs. We could replace the `or'
+   with a cmoveq/cmovne, which could issue one cycle earlier than the `or',
+   but that might waste a cycle on EV4.  The total depth remains unaffected,
+ since cmov has a latency of 2 cycles.
+
+ addq
+ / \
+ addq cmpult
+ | \
+ cmpult -> cmovne
+
+Montgomery has a slightly different way of computing carry that requires one
+less instruction, but has depth 4 (instead of the current 3). Since the
+code is currently instruction issue bound, Montgomery's idea should save us
+1/2 cycle per limb, or bring us down to a total of 17 cycles or 4.25
+cycles/limb. Unfortunately, this method will not be good for the EV6.
+
+EV6
+
+Here we have a really parallel pipeline, capable of issuing up to 4 integer
+instructions per cycle. One integer multiply instruction can issue each
+cycle. To get optimal speed, we need to pretend we are vectorizing the code,
+i.e., minimize the iterative dependencies.
+
+There are two dependencies to watch out for. 1) Address arithmetic
+dependencies, and 2) carry propagation dependencies.
+
+We can avoid serializing due to address arithmetic by unrolling the loop, so
+that addresses don't depend heavily on an index variable. Avoiding
+serializing because of carry propagation is trickier; the ultimate performance
+of the code will be determined by the number of latency cycles it takes from
+accepting carry-in to a vector point until we can generate carry-out.
+
+Most integer instructions can execute in either the L0, U0, L1, or U1
+pipelines. Shifts only execute in U0 and U1, and multiply only in U1.
+
+CMOV instructions split into two internal instructions, CMOV1 and CMOV2, but
+they execute efficiently. But CMOV splits the mapping process (see pg 2-26 in
+cmpwrgd.pdf), suggesting the CMOV should always be placed as the last
+instruction of an aligned 4 instruction block (?).
+
+Perhaps the most important issue is the latency between the L0/U0 and L1/U1
+clusters; a result obtained on either cluster has an extra cycle of latency
+for consumers in the opposite cluster. Because of the dynamic nature of the
+implementation, it is hard to predict where an instruction will execute.
+
+The shift loops need (per limb):
+ 1 load (Lx pipes)
+ 1 store (Lx pipes)
+ 2 shift (Ux pipes)
+ 1 iaddlog (Lx pipes, Ux pipes)
+Obviously, since the pipes are very equally loaded, we should get 4 insn/cycle, or 1.25 cycles/limb.
+
+For mpn_add_n, we currently have
+ 2 load (Lx pipes)
+ 1 store (Lx pipes)
+ 5 iaddlog (Lx pipes, Ux pipes)
+
+Again, we have a perfect balance and will be limited by carry propagation
+delays, currently three cycles.  The superoptimizer indicates that there
+might be sequences that--using a final cmov--have a carry propagation delay
+of just two. Montgomery's subtraction sequence could perhaps be used, by
+complementing some operands. All in all, we should get down to 2 cycles
+without many problems.
+
+For mpn_mul_1, we could do, just like for mpn_add_n:
+ not newlo,notnewlo
+ addq cylimb,newlo,newlo || cmpult cylimb,notnewlo,cyout
+ addq cyout,newhi,cylimb
+and get 2-cycle carry propagation. The instructions needed will be
+ 1 ld (Lx pipes)
+ 1 st (Lx pipes)
+ 2 mul (U1 pipe)
+ 4 iaddlog (Lx pipes, Ux pipes)
+issue1: addq not mul ld
+issue2: cmpult addq mul st
+Conclusion: no cluster delays and 2-cycle carry delays will give us 2 cycles/limb!
+
+Last, we have mpn_addmul_1. Almost certainly, we will get down to 3
+cycles/limb, which would be absolutely awesome.
+
+Old, perhaps obsolete addmul_1 dependency diagram (needs 175 columns wide screen):
+
+ i
+ s
+ s i
+ u n
+ e s
+ d t
+ r
+ i u
+l n c
+i s t
+v t i
+e r o
+ u n
+v c
+a t t
+l i y
+u o p
+e n e
+s s s
+ issue
+ in
+ cycle
+ -1 ldq
+ / \
+ 0 | \
+ | \
+ 1 | |
+ | |
+ 2 | | ldq
+ | | / \
+ 3 | mulq | \
+ | \ | \
+ 4 umulh \ | |
+ | | | |
+ 5 | | | | ldq
+ | | | | / \
+ 4calm 6 | | ldq | mulq | \
+ | | / | \ | \
+ 4casm 7 | | / umulh \ | |
+6 | || | | | |
+ 3aal 8 | || | | | | ldq
+7 | || | | | | / \
+ 4calm 9 | || | | ldq | mulq | \
+9 | || | | / | \ | \
+ 4casm 10 | || | | / umulh \ | |
+9 | || | || | | | |
+ 3aal 11 | addq | || | | | | ldq
+9 | // \ | || | | | | / \
+ 4calm 12 \ cmpult addq<-cy | || | | ldq | mulq | \
+13 \ / // \ | || | | / | \ | \
+ 4casm 13 addq cmpult stq | || | | / umulh \ | |
+11 \ / | || | || | | | |
+ 3aal 14 addq | addq | || | | | | ldq
+10 \ | // \ | || | | | | / \
+ 4calm 15 cy ----> \ cmpult addq<-cy | || | | ldq | mulq | \
+13 \ / // \ | || | | / | \ | \
+ 4casm 16 addq cmpult stq | || | | / umulh \ | |
+11 \ / | || | || | | | |
+ 3aal 17 addq | addq | || | | | |
+10 \ | // \ | || | | | |
+ 4calm 18 cy ----> \ cmpult addq<-cy | || | | ldq | mulq
+13 \ / // \ | || | | / | \
+ 4casm 19 addq cmpult stq | || | | / umulh \
+11 \ / | || | || | |
+ 3aal 20 addq | addq | || | |
+10 \ | // \ | || | |
+ 4calm 21 cy ----> \ cmpult addq<-cy | || | | ldq
+ \ / // \ | || | | /
+ 22 addq cmpult stq | || | | /
+ \ / | || | ||
+ 23 addq | addq | ||
+ \ | // \ | ||
+ 24 cy ----> \ cmpult addq<-cy | ||
+ \ / // \ | ||
+ 25 addq cmpult stq | ||
+ \ / | ||
+ 26 addq | addq
+ \ | // \
+ 27 cy ----> \ cmpult addq<-cy
+ \ / // \
+ 28 addq cmpult stq
+ \ /
+As many as 6 consecutive points will be under execution simultaneously, or if we addq
+schedule loads even further away, maybe 7 or 8. But the number of live quantities \
+is reasonable, and can easily be satisfied. cy ---->
diff --git a/rts/gmp/mpn/alpha/add_n.asm b/rts/gmp/mpn/alpha/add_n.asm
new file mode 100644
index 0000000000..08d6a9f7b8
--- /dev/null
+++ b/rts/gmp/mpn/alpha/add_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ ldq r3,0(r17)
+ ldq r4,0(r18)
+
+ subq r19,1,r19
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19
+
+$Loop0: subq r2,1,r2
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ addq r17,8,r17
+ addq r18,8,r18
+ bis r5,r5,r3
+ bis r6,r6,r4
+ addq r16,8,r16
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend
+
+ ALIGN(8)
+$Loop: subq r19,4,r19
+
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17)
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/addmul_1.asm b/rts/gmp/mpn/alpha/addmul_1.asm
new file mode 100644
index 0000000000..4ea900be6b
--- /dev/null
+++ b/rts/gmp/mpn/alpha/addmul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ addq r5,r3,r3
+ cmpult r3,r5,r4
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r0,r5,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/cntlz.asm b/rts/gmp/mpn/alpha/cntlz.asm
new file mode 100644
index 0000000000..febb3b70d9
--- /dev/null
+++ b/rts/gmp/mpn/alpha/cntlz.asm
@@ -0,0 +1,68 @@
+dnl Alpha auxiliary for longlong.h's count_leading_zeros
+
+dnl Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl DISCUSSION:
+
+dnl Other methods have been tried, and using a 128-entry table actually trims
+dnl about 10% of the execution time (on a 21164) when the table is in the L1
+dnl cache. But under non-benchmarking conditions, the table will hardly be in
+dnl the L1 cache. Tricky bit-fiddling methods with multiplies and magic tables
+dnl are also possible, but they require many more instructions than the current
+dnl code. (But for count_trailing_zeros, such tricks are beneficial.)
+dnl Finally, converting to floating-point and extracting the exponent is much
+dnl slower.
+
+ASM_START()
+PROLOGUE(MPN(count_leading_zeros))
+ bis r31,63,r0 C initialize partial result count
+
+ srl r16,32,r1 C shift down 32 steps -> r1
+ cmovne r1,r1,r16 C select r1 if non-zero
+ cmovne r1,31,r0 C if r1 is nonzero choose smaller count
+
+ srl r16,16,r1 C shift down 16 steps -> r1
+ subq r0,16,r2 C generate new partial result count
+ cmovne r1,r1,r16 C choose new r1 if non-zero
+ cmovne r1,r2,r0 C choose new count if r1 was non-zero
+
+ srl r16,8,r1
+ subq r0,8,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,4,r1
+ subq r0,4,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,2,r1
+ subq r0,2,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,1,r1 C extract bit 1
+ subq r0,r1,r0 C subtract it from partial result
+
+ ret r31,(r26),1
+EPILOGUE(MPN(count_leading_zeros))
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/default.m4 b/rts/gmp/mpn/alpha/default.m4
new file mode 100644
index 0000000000..5f4c48dc73
--- /dev/null
+++ b/rts/gmp/mpn/alpha/default.m4
@@ -0,0 +1,77 @@
+divert(-1)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+define(`ASM_START',
+ `
+ .set noreorder
+ .set noat')
+
+define(`X',`0x$1')
+define(`FLOAT64',
+ `
+ .align 3
+$1: .t_floating $2')
+
+define(`PROLOGUE',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ .frame r30,0,r26
+ .prologue 0')
+
+define(`PROLOGUE_GP',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ ldgp r29,0(r27)
+ .frame r30,0,r26
+ .prologue 1')
+
+define(`EPILOGUE',
+ `
+ .end $1')
+
+dnl Map register names r0, r1, etc, to `$0', `$1', etc.
+dnl This is needed on all systems but Unicos
+forloop(i,0,31,
+`define(`r'i,``$''i)'
+)
+forloop(i,0,31,
+`define(`f'i,``$f''i)'
+)
+
+define(`DATASTART',
+ `dnl
+ DATA
+$1:')
+define(`DATAEND',`dnl')
+
+define(`ASM_END',`dnl')
+
+divert
diff --git a/rts/gmp/mpn/alpha/ev5/add_n.asm b/rts/gmp/mpn/alpha/ev5/add_n.asm
new file mode 100644
index 0000000000..716d6404ae
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/add_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ addq r0,r4,r20 C 1st main add
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r20,r0,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ addq r1,r5,r28 C 2nd main add
+ addq r18,32,r18 C update s2_ptr
+ addq r28,r25,r21 C 2nd carry add
+ cmpult r28,r5,r8 C compute cy from last add
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r21,r28,r25 C compute cy from last add
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two adds
+ ldq r1,8(r18)
+ addq r2,r6,r28 C 3rd main add
+ ldq r4,0(r17)
+ addq r28,r25,r22 C 3rd carry add
+ ldq r5,8(r17)
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ addq r0,r4,r28 C 1st main add
+ ldq r2,16(r18)
+ addq r25,r28,r20 C 1st carry add
+ ldq r3,24(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r6,-16(r17)
+ cmpult r20,r28,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two adds
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ addq r1,r5,r28 C 2nd main add
+ stq r23,-8(r16)
+ addq r25,r28,r21 C 2nd carry add
+ addq r18,32,r18 C update s2_ptr
+ cmpult r28,r5,r8 C compute cy from last add
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r21,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r2,r6,r28 C 3rd main add
+ addq r28,r25,r22 C 3rd carry add
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: addq r0,r4,r28 C main add
+ ldq r0,8(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r4,8(r17)
+ addq r28,r25,r20 C carry add
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r20,r28,r25 C compute cy from last add
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: addq r0,r4,r28 C main add
+ addq r28,r25,r20 C carry add
+ cmpult r28,r4,r8 C compute cy from last add
+ cmpult r20,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/lshift.asm b/rts/gmp/mpn/alpha/ev5/lshift.asm
new file mode 100644
index 0000000000..cb181dda66
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/lshift.asm
@@ -0,0 +1,169 @@
+dnl Alpha EV5 __gmpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r31,r19,r20
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ srl r4,r20,r0 C compute function result
+
+ beq r28,$L0
+ subq r18,r28,r18
+
+ ALIGN(8)
+$Loop0: ldq r3,-16(r17)
+ subq r16,8,r16
+ sll r4,r19,r5
+ subq r17,8,r17
+ subq r28,1,r28
+ srl r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,0(r16)
+ bne r28,$Loop0
+
+$L0: sll r4,r19,r24
+ beq r18,$Lend
+C warm up phase 1
+ ldq r1,-16(r17)
+ subq r18,4,r18
+ ldq r2,-24(r17)
+ ldq r3,-32(r17)
+ ldq r4,-40(r17)
+ beq r18,$Lend1
+C warm up phase 2
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ ldq r1,-48(r17)
+ sll r2,r19,r22
+ ldq r2,-56(r17)
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ ldq r3,-64(r17)
+ sll r4,r19,r24
+ ldq r4,-72(r17)
+ subq r18,4,r18
+ beq r18,$Lend2
+ ALIGN(16)
+C main loop
+$Loop: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+
+ srl r1,r20,r7
+ subq r18,4,r18
+ sll r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ srl r2,r20,r8
+ ldq r1,-80(r17)
+ sll r2,r19,r22
+ ldq r2,-88(r17)
+
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+
+ srl r3,r20,r5
+ unop C ldq r31,-96(r17)
+ sll r3,r19,r23
+ subq r16,32,r16
+
+ srl r4,r20,r6
+ ldq r3,-96(r17)
+ sll r4,r19,r24
+ ldq r4,-104(r17)
+
+ subq r17,32,r17
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+ srl r3,r20,r5
+ sll r3,r19,r23
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 2/2
+ stq r7,-40(r16)
+ bis r5,r22,r5
+ stq r8,-48(r16)
+ bis r6,r23,r6
+ stq r5,-56(r16)
+ stq r6,-64(r16)
+C cool down phase 2/3
+ stq r24,-72(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 1/2
+ stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ stq r5,-24(r16)
+ stq r6,-32(r16)
+ stq r24,-40(r16)
+ ret r31,(r26),1
+
+$Lend: stq r24,-8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/rshift.asm b/rts/gmp/mpn/alpha/ev5/rshift.asm
new file mode 100644
index 0000000000..9940d83fad
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/rshift.asm
@@ -0,0 +1,167 @@
+dnl Alpha EV5 __gmpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ subq r31,r19,r20
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ sll r4,r20,r0 C compute function result
+
+ beq r28,$L0
+ subq r18,r28,r18
+
+ ALIGN(8)
+$Loop0: ldq r3,8(r17)
+ addq r16,8,r16
+ srl r4,r19,r5
+ addq r17,8,r17
+ subq r28,1,r28
+ sll r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,-8(r16)
+ bne r28,$Loop0
+
+$L0: srl r4,r19,r24
+ beq r18,$Lend
+C warm up phase 1
+ ldq r1,8(r17)
+ subq r18,4,r18
+ ldq r2,16(r17)
+ ldq r3,24(r17)
+ ldq r4,32(r17)
+ beq r18,$Lend1
+C warm up phase 2
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ ldq r1,40(r17)
+ srl r2,r19,r22
+ ldq r2,48(r17)
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ ldq r3,56(r17)
+ srl r4,r19,r24
+ ldq r4,64(r17)
+ subq r18,4,r18
+ beq r18,$Lend2
+ ALIGN(16)
+C main loop
+$Loop: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+
+ sll r1,r20,r7
+ subq r18,4,r18
+ srl r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ sll r2,r20,r8
+ ldq r1,72(r17)
+ srl r2,r19,r22
+ ldq r2,80(r17)
+
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+
+ sll r3,r20,r5
+ unop C ldq r31,-96(r17)
+ srl r3,r19,r23
+ addq r16,32,r16
+
+ sll r4,r20,r6
+ ldq r3,88(r17)
+ srl r4,r19,r24
+ ldq r4,96(r17)
+
+ addq r17,32,r17
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+ sll r3,r20,r5
+ srl r3,r19,r23
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 2/2
+ stq r7,32(r16)
+ bis r5,r22,r5
+ stq r8,40(r16)
+ bis r6,r23,r6
+ stq r5,48(r16)
+ stq r6,56(r16)
+C cool down phase 2/3
+ stq r24,64(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 1/2
+ stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ stq r5,16(r16)
+ stq r6,24(r16)
+ stq r24,32(r16)
+ ret r31,(r26),1
+
+$Lend: stq r24,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/sub_n.asm b/rts/gmp/mpn/alpha/ev5/sub_n.asm
new file mode 100644
index 0000000000..5248a2aa38
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/sub_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ subq r4,r0,r20 C 1st main subtract
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r4,r0,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ subq r5,r1,r28 C 2nd main subtract
+ addq r18,32,r18 C update s2_ptr
+ subq r28,r25,r21 C 2nd carry subtract
+ cmpult r5,r1,r8 C compute cy from last subtract
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ ldq r1,8(r18)
+ subq r6,r2,r28 C 3rd main subtract
+ ldq r4,0(r17)
+ subq r28,r25,r22 C 3rd carry subtract
+ ldq r5,8(r17)
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ subq r4,r0,r28 C 1st main subtract
+ ldq r2,16(r18)
+ subq r28,r25,r20 C 1st carry subtract
+ ldq r3,24(r18)
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r6,-16(r17)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ subq r5,r1,r28 C 2nd main subtract
+ stq r23,-8(r16)
+ subq r28,r25,r21 C 2nd carry subtract
+ addq r18,32,r18 C update s2_ptr
+ cmpult r5,r1,r8 C compute cy from last subtract
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r6,r2,r28 C cy add
+ subq r28,r25,r22 C 3rd main subtract
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C cy add
+ subq r28,r25,r23 C 4th main subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: subq r4,r0,r28 C main subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r0,8(r18)
+ ldq r4,8(r17)
+ subq r28,r25,r20 C carry subtract
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: subq r4,r0,r28 C main subtract
+ subq r28,r25,r20 C carry subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/addmul_1.asm b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
new file mode 100644
index 0000000000..2f588626a5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
@@ -0,0 +1,474 @@
+dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
+dnl exactly 3.625 cycles/limb on EV6...
+
+dnl This code was written in close cooperation with ev6 pipeline expert
+dnl Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
+dnl
+dnl Register usages for unrolled loop:
+dnl 0-3 mul's
+dnl 4-7 acc's
+dnl 8-15 mul results
+dnl 20,21 carry's
+dnl 22,23 save for stores
+
+dnl Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+
+dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl them, so that further disturbance to the schedule is damped.
+
+dnl We couldn't pair the loads, because the entangled schedule of the
+dnl carry's has to happen on one side {0} of the machine. Note, the total
+dnl use of U0, and the total use of L0 (after attending to the stores),
+dnl which is part of the reason why.
+
+dnl This is a great schedule for the d_cache, a poor schedule for the
+dnl b_cache. The lockup on U0 means that any stall can't be recovered
+dnl from.  Consider a ldq in L1.  Say that load gets stalled because it
+dnl collides with a fill from the b_cache.  On the next cycle, this load
+dnl gets priority.  It first looks at L0, and goes there.  The instruction
+dnl we intended for L0 gets to look at L1, which is NOT where we want
+dnl it. It either stalls 1, because it can't go in L0, or goes there, and
+dnl causes a further instruction to stall.
+
+dnl So for b_cache, we're likely going to want to put one or more cycles
+dnl back into the code! And, of course, put in prefetches. For the
+dnl accumulator, lds, intent to modify. For the multiplier, you might
+dnl want ldq, evict next, if you're not wanting to use it again soon. Use
+dnl 256 ahead of present pointer value. At a place where we have an mt
+dnl followed by a bookkeeping, put the bookkeeping in upper, and the
+dnl prefetch into lower.
+
+dnl Note, the usage of physical registers per cycle is smoothed off, as
+dnl much as possible.
+
+dnl Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
+dnl like not to have a ldq or stq precede a conditional branch in a
+dnl quadpack. The conditional branch moves the retire pointer one cycle
+dnl later.
+
+dnl Optimization notes:
+dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
+dnl Reserved regs: r29 r30 r31
+dnl Free caller-saves regs in unrolled code: r24 r25 r28
+dnl We should swap some of the callee-saves regs for some of the free
+dnl caller-saves regs, saving some overhead cycles.
+dnl Most importantly, we should write fast code for the 0-7 case.
+dnl The code we use there is for the 21164, and runs at 7 cycles/limb
+dnl on the 21264. Should not be hard, if we write specialized code for
+dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+dnl need a jump table indexed by the low 3 bits of the count argument.
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpult r18, 8, r1
+ beq r1, $Large
+
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r18, $Lend0b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r18, $Lend0a C jump if size was == 2
+
+ ALIGN(8)
+$Loop0: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r18, 1, r18 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r18, $Loop0
+$Lend0a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ ret r31, (r26), 1
+$Lend0b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r0, r5, r0
+ ret r31, (r26), 1
+
+$Large:
+ lda $30, -240($30)
+ stq $9, 8($30)
+ stq $10, 16($30)
+ stq $11, 24($30)
+ stq $12, 32($30)
+ stq $13, 40($30)
+ stq $14, 48($30)
+ stq $15, 56($30)
+
+ and r18, 7, r20 C count for the first loop, 0-7
+ srl r18, 3, r18 C count for unrolled loop
+ bis r31, r31, r0
+ beq r20, $Lunroll
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r20, $Lend1b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r20, $Lend1a C jump if size was == 2
+
+ ALIGN(8)
+$Loop1: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r20, 1, r20 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r20, $Loop1
+
+$Lend1a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ br r31, $Lunroll
+$Lend1b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r0, r5, r0
+
+$Lunroll:
+ lda r17, -16(r17) C L1 bookkeeping
+ lda r16, -16(r16) C L1 bookkeeping
+ bis r0, r31, r12
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldq r2, 16(r17) C L1
+ ldq r3, 24(r17) C L1
+ lda r18, -1(r18) C L1 bookkeeping
+ ldq r6, 16(r16) C L1
+ ldq r7, 24(r16) C L1
+ ldq r0, 32(r17) C L1
+ mulq r19, r2, r13 C U1
+ ldq r1, 40(r17) C L1
+ umulh r19, r2, r14 C U1
+ mulq r19, r3, r15 C U1
+ lda r17, 64(r17) C L1 bookkeeping
+ ldq r4, 32(r16) C L1
+ ldq r5, 40(r16) C L1
+ umulh r19, r3, r8 C U1
+ ldq r2, -16(r17) C L1
+ mulq r19, r0, r9 C U1
+ ldq r3, -8(r17) C L1
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ mulq r19, r1, r11 C U1
+ cmpult r6, r13, r20 C L0 lo add => carry
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+ mulq r19, r3, r15 C U1
+ addq r8, r21, r8 C U0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ ble r18, $Lend C U1 bookkeeping
+
+C ____ MAIN UNROLLED LOOP ____
+ ALIGN(16)
+$Loop:
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, 16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, 24(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r18, -1(r18) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, 16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, 24(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 32(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 40(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r17, 64(r17) C L1 bookkeeping
+ addq r4, r8, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 32(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 40(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, -16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, -8(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ bis r31, r31, r31 C L1 mt
+ bgt r18, $Loop C U1 bookkeeping
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+ mulq r19, r1, r11 C U1
+ addq r12, r21, r12 C U0 hi mul + carry
+ cmpult r6, r13, r20 C L0 lo add => carry
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+ bis r31, r31, r31 C L0 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+ addq r12, r21, r0 C U0 hi mul + carry
+
+ ldq $9, 8($30)
+ ldq $10, 16($30)
+ ldq $11, 24($30)
+ ldq $12, 32($30)
+ ldq $13, 40($30)
+ ldq $14, 48($30)
+ ldq $15, 56($30)
+ lda $30, 240($30)
+ ret r31, (r26), 1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/gmp-mparam.h b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
new file mode 100644
index 0000000000..7ea20577f8
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -0,0 +1,62 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Generated by tuneup.c, 2000-08-02. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 47
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 70
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 94
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 101
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 33
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 70
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 29
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 33
+#endif
diff --git a/rts/gmp/mpn/alpha/gmp-mparam.h b/rts/gmp/mpn/alpha/gmp-mparam.h
new file mode 100644
index 0000000000..054ff2fe5f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/gmp-mparam.h
@@ -0,0 +1,64 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the 21164 family. The 21264 will require
+ different values, since it has such quick multiplication. */
+/* Generated by tuneup.c, 2000-07-19. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 22
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 53
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 31
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 47
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 64
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 17
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 4
+#endif
diff --git a/rts/gmp/mpn/alpha/invert_limb.asm b/rts/gmp/mpn/alpha/invert_limb.asm
new file mode 100644
index 0000000000..a921b32b3f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/invert_limb.asm
@@ -0,0 +1,345 @@
+dnl Alpha mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+dnl
+dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c.
+dnl The ideas are due to Peter L. Montgomery
+dnl
+dnl The table below uses 4096 bytes. The file mentioned above has an
+dnl alternative function that doesn't require the table, but it runs 50%
+dnl slower than this.
+
+include(`../config.m4')
+
+ASM_START()
+
+FLOAT64($C36,9223372036854775808.0) C 2^63
+
+PROLOGUE_GP(mpn_invert_limb)
+ lda r30,-16(r30)
+ addq r16,r16,r1
+ bne r1,$73
+ lda r0,-1
+ br r31,$Lend
+$73:
+ srl r16,1,r1
+ stq r1,0(r30)
+ ldt f11,0(r30)
+ cvtqt f11,f1
+ lda r1,$C36
+ ldt f10,0(r1)
+ divt f10,f1,f10
+ lda r2,$invtab-4096
+ srl r16,52,r1
+ addq r1,r1,r1
+ addq r1,r2,r1
+ bic r1,6,r2
+ ldq r2,0(r2)
+ bic r1,1,r1
+ extwl r2,r1,r2
+ sll r2,48,r0
+ umulh r16,r0,r1
+ addq r16,r1,r3
+ stq r3,0(r30)
+ ldt f11,0(r30)
+ cvtqt f11,f1
+ mult f1,f10,f1
+ cvttqc f1,f1
+ stt f1,0(r30)
+ ldq r4,0(r30)
+ subq r0,r4,r0
+ umulh r16,r0,r1
+ mulq r16,r0,r2
+ addq r16,r1,r3
+ bge r3,$Loop2
+$Loop1: addq r2,r16,r2
+ cmpult r2,r16,r1
+ addq r3,r1,r3
+ addq r0,1,r0
+ blt r3,$Loop1
+$Loop2: cmpult r2,r16,r1
+ subq r0,1,r0
+ subq r3,r1,r3
+ subq r2,r16,r2
+ bge r3,$Loop2
+$Lend:
+ lda r30,16(r30)
+ ret r31,(r26),1
+EPILOGUE(mpn_invert_limb)
+DATASTART(`$invtab',4)
+ .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41
+ .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46
+ .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50
+ .word 0xfa11,0xf9d3,0xf994,0xf956,0xf918,0xf8d9,0xf89b,0xf85d
+ .word 0xf81f,0xf7e1,0xf7a3,0xf765,0xf727,0xf6ea,0xf6ac,0xf66e
+ .word 0xf631,0xf5f3,0xf5b6,0xf578,0xf53b,0xf4fd,0xf4c0,0xf483
+ .word 0xf446,0xf409,0xf3cc,0xf38f,0xf352,0xf315,0xf2d8,0xf29c
+ .word 0xf25f,0xf222,0xf1e6,0xf1a9,0xf16d,0xf130,0xf0f4,0xf0b8
+ .word 0xf07c,0xf03f,0xf003,0xefc7,0xef8b,0xef4f,0xef14,0xeed8
+ .word 0xee9c,0xee60,0xee25,0xede9,0xedae,0xed72,0xed37,0xecfb
+ .word 0xecc0,0xec85,0xec4a,0xec0e,0xebd3,0xeb98,0xeb5d,0xeb22
+ .word 0xeae8,0xeaad,0xea72,0xea37,0xe9fd,0xe9c2,0xe988,0xe94d
+ .word 0xe913,0xe8d8,0xe89e,0xe864,0xe829,0xe7ef,0xe7b5,0xe77b
+ .word 0xe741,0xe707,0xe6cd,0xe694,0xe65a,0xe620,0xe5e6,0xe5ad
+ .word 0xe573,0xe53a,0xe500,0xe4c7,0xe48d,0xe454,0xe41b,0xe3e2
+ .word 0xe3a9,0xe370,0xe336,0xe2fd,0xe2c5,0xe28c,0xe253,0xe21a
+ .word 0xe1e1,0xe1a9,0xe170,0xe138,0xe0ff,0xe0c7,0xe08e,0xe056
+ .word 0xe01e,0xdfe5,0xdfad,0xdf75,0xdf3d,0xdf05,0xdecd,0xde95
+ .word 0xde5d,0xde25,0xdded,0xddb6,0xdd7e,0xdd46,0xdd0f,0xdcd7
+ .word 0xdca0,0xdc68,0xdc31,0xdbf9,0xdbc2,0xdb8b,0xdb54,0xdb1d
+ .word 0xdae6,0xdaae,0xda78,0xda41,0xda0a,0xd9d3,0xd99c,0xd965
+ .word 0xd92f,0xd8f8,0xd8c1,0xd88b,0xd854,0xd81e,0xd7e8,0xd7b1
+ .word 0xd77b,0xd745,0xd70e,0xd6d8,0xd6a2,0xd66c,0xd636,0xd600
+ .word 0xd5ca,0xd594,0xd55f,0xd529,0xd4f3,0xd4bd,0xd488,0xd452
+ .word 0xd41d,0xd3e7,0xd3b2,0xd37c,0xd347,0xd312,0xd2dd,0xd2a7
+ .word 0xd272,0xd23d,0xd208,0xd1d3,0xd19e,0xd169,0xd134,0xd100
+ .word 0xd0cb,0xd096,0xd061,0xd02d,0xcff8,0xcfc4,0xcf8f,0xcf5b
+ .word 0xcf26,0xcef2,0xcebe,0xce89,0xce55,0xce21,0xcded,0xcdb9
+ .word 0xcd85,0xcd51,0xcd1d,0xcce9,0xccb5,0xcc81,0xcc4e,0xcc1a
+ .word 0xcbe6,0xcbb3,0xcb7f,0xcb4c,0xcb18,0xcae5,0xcab1,0xca7e
+ .word 0xca4b,0xca17,0xc9e4,0xc9b1,0xc97e,0xc94b,0xc918,0xc8e5
+ .word 0xc8b2,0xc87f,0xc84c,0xc819,0xc7e7,0xc7b4,0xc781,0xc74f
+ .word 0xc71c,0xc6e9,0xc6b7,0xc684,0xc652,0xc620,0xc5ed,0xc5bb
+ .word 0xc589,0xc557,0xc524,0xc4f2,0xc4c0,0xc48e,0xc45c,0xc42a
+ .word 0xc3f8,0xc3c7,0xc395,0xc363,0xc331,0xc300,0xc2ce,0xc29c
+ .word 0xc26b,0xc239,0xc208,0xc1d6,0xc1a5,0xc174,0xc142,0xc111
+ .word 0xc0e0,0xc0af,0xc07e,0xc04d,0xc01c,0xbfeb,0xbfba,0xbf89
+ .word 0xbf58,0xbf27,0xbef6,0xbec5,0xbe95,0xbe64,0xbe33,0xbe03
+ .word 0xbdd2,0xbda2,0xbd71,0xbd41,0xbd10,0xbce0,0xbcb0,0xbc80
+ .word 0xbc4f,0xbc1f,0xbbef,0xbbbf,0xbb8f,0xbb5f,0xbb2f,0xbaff
+ .word 0xbacf,0xba9f,0xba6f,0xba40,0xba10,0xb9e0,0xb9b1,0xb981
+ .word 0xb951,0xb922,0xb8f2,0xb8c3,0xb894,0xb864,0xb835,0xb806
+ .word 0xb7d6,0xb7a7,0xb778,0xb749,0xb71a,0xb6eb,0xb6bc,0xb68d
+ .word 0xb65e,0xb62f,0xb600,0xb5d1,0xb5a2,0xb574,0xb545,0xb516
+ .word 0xb4e8,0xb4b9,0xb48a,0xb45c,0xb42e,0xb3ff,0xb3d1,0xb3a2
+ .word 0xb374,0xb346,0xb318,0xb2e9,0xb2bb,0xb28d,0xb25f,0xb231
+ .word 0xb203,0xb1d5,0xb1a7,0xb179,0xb14b,0xb11d,0xb0f0,0xb0c2
+ .word 0xb094,0xb067,0xb039,0xb00b,0xafde,0xafb0,0xaf83,0xaf55
+ .word 0xaf28,0xaefb,0xaecd,0xaea0,0xae73,0xae45,0xae18,0xadeb
+ .word 0xadbe,0xad91,0xad64,0xad37,0xad0a,0xacdd,0xacb0,0xac83
+ .word 0xac57,0xac2a,0xabfd,0xabd0,0xaba4,0xab77,0xab4a,0xab1e
+ .word 0xaaf1,0xaac5,0xaa98,0xaa6c,0xaa40,0xaa13,0xa9e7,0xa9bb
+ .word 0xa98e,0xa962,0xa936,0xa90a,0xa8de,0xa8b2,0xa886,0xa85a
+ .word 0xa82e,0xa802,0xa7d6,0xa7aa,0xa77e,0xa753,0xa727,0xa6fb
+ .word 0xa6d0,0xa6a4,0xa678,0xa64d,0xa621,0xa5f6,0xa5ca,0xa59f
+ .word 0xa574,0xa548,0xa51d,0xa4f2,0xa4c6,0xa49b,0xa470,0xa445
+ .word 0xa41a,0xa3ef,0xa3c4,0xa399,0xa36e,0xa343,0xa318,0xa2ed
+ .word 0xa2c2,0xa297,0xa26d,0xa242,0xa217,0xa1ed,0xa1c2,0xa197
+ .word 0xa16d,0xa142,0xa118,0xa0ed,0xa0c3,0xa098,0xa06e,0xa044
+ .word 0xa01a,0x9fef,0x9fc5,0x9f9b,0x9f71,0x9f47,0x9f1c,0x9ef2
+ .word 0x9ec8,0x9e9e,0x9e74,0x9e4b,0x9e21,0x9df7,0x9dcd,0x9da3
+ .word 0x9d79,0x9d50,0x9d26,0x9cfc,0x9cd3,0x9ca9,0x9c80,0x9c56
+ .word 0x9c2d,0x9c03,0x9bda,0x9bb0,0x9b87,0x9b5e,0x9b34,0x9b0b
+ .word 0x9ae2,0x9ab9,0x9a8f,0x9a66,0x9a3d,0x9a14,0x99eb,0x99c2
+ .word 0x9999,0x9970,0x9947,0x991e,0x98f6,0x98cd,0x98a4,0x987b
+ .word 0x9852,0x982a,0x9801,0x97d8,0x97b0,0x9787,0x975f,0x9736
+ .word 0x970e,0x96e5,0x96bd,0x9695,0x966c,0x9644,0x961c,0x95f3
+ .word 0x95cb,0x95a3,0x957b,0x9553,0x952b,0x9503,0x94db,0x94b3
+ .word 0x948b,0x9463,0x943b,0x9413,0x93eb,0x93c3,0x939b,0x9374
+ .word 0x934c,0x9324,0x92fd,0x92d5,0x92ad,0x9286,0x925e,0x9237
+ .word 0x920f,0x91e8,0x91c0,0x9199,0x9172,0x914a,0x9123,0x90fc
+ .word 0x90d4,0x90ad,0x9086,0x905f,0x9038,0x9011,0x8fea,0x8fc3
+ .word 0x8f9c,0x8f75,0x8f4e,0x8f27,0x8f00,0x8ed9,0x8eb2,0x8e8b
+ .word 0x8e65,0x8e3e,0x8e17,0x8df1,0x8dca,0x8da3,0x8d7d,0x8d56
+ .word 0x8d30,0x8d09,0x8ce3,0x8cbc,0x8c96,0x8c6f,0x8c49,0x8c23
+ .word 0x8bfc,0x8bd6,0x8bb0,0x8b8a,0x8b64,0x8b3d,0x8b17,0x8af1
+ .word 0x8acb,0x8aa5,0x8a7f,0x8a59,0x8a33,0x8a0d,0x89e7,0x89c1
+ .word 0x899c,0x8976,0x8950,0x892a,0x8904,0x88df,0x88b9,0x8893
+ .word 0x886e,0x8848,0x8823,0x87fd,0x87d8,0x87b2,0x878d,0x8767
+ .word 0x8742,0x871d,0x86f7,0x86d2,0x86ad,0x8687,0x8662,0x863d
+ .word 0x8618,0x85f3,0x85ce,0x85a9,0x8583,0x855e,0x8539,0x8514
+ .word 0x84f0,0x84cb,0x84a6,0x8481,0x845c,0x8437,0x8412,0x83ee
+ .word 0x83c9,0x83a4,0x8380,0x835b,0x8336,0x8312,0x82ed,0x82c9
+ .word 0x82a4,0x8280,0x825b,0x8237,0x8212,0x81ee,0x81ca,0x81a5
+ .word 0x8181,0x815d,0x8138,0x8114,0x80f0,0x80cc,0x80a8,0x8084
+ .word 0x8060,0x803c,0x8018,0x7ff4,0x7fd0,0x7fac,0x7f88,0x7f64
+ .word 0x7f40,0x7f1c,0x7ef8,0x7ed4,0x7eb1,0x7e8d,0x7e69,0x7e45
+ .word 0x7e22,0x7dfe,0x7ddb,0x7db7,0x7d93,0x7d70,0x7d4c,0x7d29
+ .word 0x7d05,0x7ce2,0x7cbf,0x7c9b,0x7c78,0x7c55,0x7c31,0x7c0e
+ .word 0x7beb,0x7bc7,0x7ba4,0x7b81,0x7b5e,0x7b3b,0x7b18,0x7af5
+ .word 0x7ad2,0x7aaf,0x7a8c,0x7a69,0x7a46,0x7a23,0x7a00,0x79dd
+ .word 0x79ba,0x7997,0x7975,0x7952,0x792f,0x790c,0x78ea,0x78c7
+ .word 0x78a4,0x7882,0x785f,0x783c,0x781a,0x77f7,0x77d5,0x77b2
+ .word 0x7790,0x776e,0x774b,0x7729,0x7706,0x76e4,0x76c2,0x76a0
+ .word 0x767d,0x765b,0x7639,0x7617,0x75f5,0x75d2,0x75b0,0x758e
+ .word 0x756c,0x754a,0x7528,0x7506,0x74e4,0x74c2,0x74a0,0x747e
+ .word 0x745d,0x743b,0x7419,0x73f7,0x73d5,0x73b4,0x7392,0x7370
+ .word 0x734f,0x732d,0x730b,0x72ea,0x72c8,0x72a7,0x7285,0x7264
+ .word 0x7242,0x7221,0x71ff,0x71de,0x71bc,0x719b,0x717a,0x7158
+ .word 0x7137,0x7116,0x70f5,0x70d3,0x70b2,0x7091,0x7070,0x704f
+ .word 0x702e,0x700c,0x6feb,0x6fca,0x6fa9,0x6f88,0x6f67,0x6f46
+ .word 0x6f26,0x6f05,0x6ee4,0x6ec3,0x6ea2,0x6e81,0x6e60,0x6e40
+ .word 0x6e1f,0x6dfe,0x6dde,0x6dbd,0x6d9c,0x6d7c,0x6d5b,0x6d3a
+ .word 0x6d1a,0x6cf9,0x6cd9,0x6cb8,0x6c98,0x6c77,0x6c57,0x6c37
+ .word 0x6c16,0x6bf6,0x6bd6,0x6bb5,0x6b95,0x6b75,0x6b54,0x6b34
+ .word 0x6b14,0x6af4,0x6ad4,0x6ab4,0x6a94,0x6a73,0x6a53,0x6a33
+ .word 0x6a13,0x69f3,0x69d3,0x69b3,0x6993,0x6974,0x6954,0x6934
+ .word 0x6914,0x68f4,0x68d4,0x68b5,0x6895,0x6875,0x6855,0x6836
+ .word 0x6816,0x67f6,0x67d7,0x67b7,0x6798,0x6778,0x6758,0x6739
+ .word 0x6719,0x66fa,0x66db,0x66bb,0x669c,0x667c,0x665d,0x663e
+ .word 0x661e,0x65ff,0x65e0,0x65c0,0x65a1,0x6582,0x6563,0x6544
+ .word 0x6524,0x6505,0x64e6,0x64c7,0x64a8,0x6489,0x646a,0x644b
+ .word 0x642c,0x640d,0x63ee,0x63cf,0x63b0,0x6391,0x6373,0x6354
+ .word 0x6335,0x6316,0x62f7,0x62d9,0x62ba,0x629b,0x627c,0x625e
+ .word 0x623f,0x6221,0x6202,0x61e3,0x61c5,0x61a6,0x6188,0x6169
+ .word 0x614b,0x612c,0x610e,0x60ef,0x60d1,0x60b3,0x6094,0x6076
+ .word 0x6058,0x6039,0x601b,0x5ffd,0x5fdf,0x5fc0,0x5fa2,0x5f84
+ .word 0x5f66,0x5f48,0x5f2a,0x5f0b,0x5eed,0x5ecf,0x5eb1,0x5e93
+ .word 0x5e75,0x5e57,0x5e39,0x5e1b,0x5dfd,0x5de0,0x5dc2,0x5da4
+ .word 0x5d86,0x5d68,0x5d4a,0x5d2d,0x5d0f,0x5cf1,0x5cd3,0x5cb6
+ .word 0x5c98,0x5c7a,0x5c5d,0x5c3f,0x5c21,0x5c04,0x5be6,0x5bc9
+ .word 0x5bab,0x5b8e,0x5b70,0x5b53,0x5b35,0x5b18,0x5afb,0x5add
+ .word 0x5ac0,0x5aa2,0x5a85,0x5a68,0x5a4b,0x5a2d,0x5a10,0x59f3
+ .word 0x59d6,0x59b8,0x599b,0x597e,0x5961,0x5944,0x5927,0x590a
+ .word 0x58ed,0x58d0,0x58b3,0x5896,0x5879,0x585c,0x583f,0x5822
+ .word 0x5805,0x57e8,0x57cb,0x57ae,0x5791,0x5775,0x5758,0x573b
+ .word 0x571e,0x5702,0x56e5,0x56c8,0x56ac,0x568f,0x5672,0x5656
+ .word 0x5639,0x561c,0x5600,0x55e3,0x55c7,0x55aa,0x558e,0x5571
+ .word 0x5555,0x5538,0x551c,0x5500,0x54e3,0x54c7,0x54aa,0x548e
+ .word 0x5472,0x5456,0x5439,0x541d,0x5401,0x53e5,0x53c8,0x53ac
+ .word 0x5390,0x5374,0x5358,0x533c,0x5320,0x5304,0x52e8,0x52cb
+ .word 0x52af,0x5293,0x5277,0x525c,0x5240,0x5224,0x5208,0x51ec
+ .word 0x51d0,0x51b4,0x5198,0x517c,0x5161,0x5145,0x5129,0x510d
+ .word 0x50f2,0x50d6,0x50ba,0x509f,0x5083,0x5067,0x504c,0x5030
+ .word 0x5015,0x4ff9,0x4fdd,0x4fc2,0x4fa6,0x4f8b,0x4f6f,0x4f54
+ .word 0x4f38,0x4f1d,0x4f02,0x4ee6,0x4ecb,0x4eb0,0x4e94,0x4e79
+ .word 0x4e5e,0x4e42,0x4e27,0x4e0c,0x4df0,0x4dd5,0x4dba,0x4d9f
+ .word 0x4d84,0x4d69,0x4d4d,0x4d32,0x4d17,0x4cfc,0x4ce1,0x4cc6
+ .word 0x4cab,0x4c90,0x4c75,0x4c5a,0x4c3f,0x4c24,0x4c09,0x4bee
+ .word 0x4bd3,0x4bb9,0x4b9e,0x4b83,0x4b68,0x4b4d,0x4b32,0x4b18
+ .word 0x4afd,0x4ae2,0x4ac7,0x4aad,0x4a92,0x4a77,0x4a5d,0x4a42
+ .word 0x4a27,0x4a0d,0x49f2,0x49d8,0x49bd,0x49a3,0x4988,0x496e
+ .word 0x4953,0x4939,0x491e,0x4904,0x48e9,0x48cf,0x48b5,0x489a
+ .word 0x4880,0x4865,0x484b,0x4831,0x4817,0x47fc,0x47e2,0x47c8
+ .word 0x47ae,0x4793,0x4779,0x475f,0x4745,0x472b,0x4711,0x46f6
+ .word 0x46dc,0x46c2,0x46a8,0x468e,0x4674,0x465a,0x4640,0x4626
+ .word 0x460c,0x45f2,0x45d8,0x45be,0x45a5,0x458b,0x4571,0x4557
+ .word 0x453d,0x4523,0x4509,0x44f0,0x44d6,0x44bc,0x44a2,0x4489
+ .word 0x446f,0x4455,0x443c,0x4422,0x4408,0x43ef,0x43d5,0x43bc
+ .word 0x43a2,0x4388,0x436f,0x4355,0x433c,0x4322,0x4309,0x42ef
+ .word 0x42d6,0x42bc,0x42a3,0x428a,0x4270,0x4257,0x423d,0x4224
+ .word 0x420b,0x41f2,0x41d8,0x41bf,0x41a6,0x418c,0x4173,0x415a
+ .word 0x4141,0x4128,0x410e,0x40f5,0x40dc,0x40c3,0x40aa,0x4091
+ .word 0x4078,0x405f,0x4046,0x402d,0x4014,0x3ffb,0x3fe2,0x3fc9
+ .word 0x3fb0,0x3f97,0x3f7e,0x3f65,0x3f4c,0x3f33,0x3f1a,0x3f01
+ .word 0x3ee8,0x3ed0,0x3eb7,0x3e9e,0x3e85,0x3e6c,0x3e54,0x3e3b
+ .word 0x3e22,0x3e0a,0x3df1,0x3dd8,0x3dc0,0x3da7,0x3d8e,0x3d76
+ .word 0x3d5d,0x3d45,0x3d2c,0x3d13,0x3cfb,0x3ce2,0x3cca,0x3cb1
+ .word 0x3c99,0x3c80,0x3c68,0x3c50,0x3c37,0x3c1f,0x3c06,0x3bee
+ .word 0x3bd6,0x3bbd,0x3ba5,0x3b8d,0x3b74,0x3b5c,0x3b44,0x3b2b
+ .word 0x3b13,0x3afb,0x3ae3,0x3acb,0x3ab2,0x3a9a,0x3a82,0x3a6a
+ .word 0x3a52,0x3a3a,0x3a22,0x3a09,0x39f1,0x39d9,0x39c1,0x39a9
+ .word 0x3991,0x3979,0x3961,0x3949,0x3931,0x3919,0x3901,0x38ea
+ .word 0x38d2,0x38ba,0x38a2,0x388a,0x3872,0x385a,0x3843,0x382b
+ .word 0x3813,0x37fb,0x37e3,0x37cc,0x37b4,0x379c,0x3785,0x376d
+ .word 0x3755,0x373e,0x3726,0x370e,0x36f7,0x36df,0x36c8,0x36b0
+ .word 0x3698,0x3681,0x3669,0x3652,0x363a,0x3623,0x360b,0x35f4
+ .word 0x35dc,0x35c5,0x35ae,0x3596,0x357f,0x3567,0x3550,0x3539
+ .word 0x3521,0x350a,0x34f3,0x34db,0x34c4,0x34ad,0x3496,0x347e
+ .word 0x3467,0x3450,0x3439,0x3422,0x340a,0x33f3,0x33dc,0x33c5
+ .word 0x33ae,0x3397,0x3380,0x3368,0x3351,0x333a,0x3323,0x330c
+ .word 0x32f5,0x32de,0x32c7,0x32b0,0x3299,0x3282,0x326c,0x3255
+ .word 0x323e,0x3227,0x3210,0x31f9,0x31e2,0x31cb,0x31b5,0x319e
+ .word 0x3187,0x3170,0x3159,0x3143,0x312c,0x3115,0x30fe,0x30e8
+ .word 0x30d1,0x30ba,0x30a4,0x308d,0x3076,0x3060,0x3049,0x3033
+ .word 0x301c,0x3005,0x2fef,0x2fd8,0x2fc2,0x2fab,0x2f95,0x2f7e
+ .word 0x2f68,0x2f51,0x2f3b,0x2f24,0x2f0e,0x2ef8,0x2ee1,0x2ecb
+ .word 0x2eb4,0x2e9e,0x2e88,0x2e71,0x2e5b,0x2e45,0x2e2e,0x2e18
+ .word 0x2e02,0x2dec,0x2dd5,0x2dbf,0x2da9,0x2d93,0x2d7c,0x2d66
+ .word 0x2d50,0x2d3a,0x2d24,0x2d0e,0x2cf8,0x2ce1,0x2ccb,0x2cb5
+ .word 0x2c9f,0x2c89,0x2c73,0x2c5d,0x2c47,0x2c31,0x2c1b,0x2c05
+ .word 0x2bef,0x2bd9,0x2bc3,0x2bad,0x2b97,0x2b81,0x2b6c,0x2b56
+ .word 0x2b40,0x2b2a,0x2b14,0x2afe,0x2ae8,0x2ad3,0x2abd,0x2aa7
+ .word 0x2a91,0x2a7c,0x2a66,0x2a50,0x2a3a,0x2a25,0x2a0f,0x29f9
+ .word 0x29e4,0x29ce,0x29b8,0x29a3,0x298d,0x2977,0x2962,0x294c
+ .word 0x2937,0x2921,0x290c,0x28f6,0x28e0,0x28cb,0x28b5,0x28a0
+ .word 0x288b,0x2875,0x2860,0x284a,0x2835,0x281f,0x280a,0x27f5
+ .word 0x27df,0x27ca,0x27b4,0x279f,0x278a,0x2774,0x275f,0x274a
+ .word 0x2735,0x271f,0x270a,0x26f5,0x26e0,0x26ca,0x26b5,0x26a0
+ .word 0x268b,0x2676,0x2660,0x264b,0x2636,0x2621,0x260c,0x25f7
+ .word 0x25e2,0x25cd,0x25b8,0x25a2,0x258d,0x2578,0x2563,0x254e
+ .word 0x2539,0x2524,0x250f,0x24fa,0x24e5,0x24d1,0x24bc,0x24a7
+ .word 0x2492,0x247d,0x2468,0x2453,0x243e,0x2429,0x2415,0x2400
+ .word 0x23eb,0x23d6,0x23c1,0x23ad,0x2398,0x2383,0x236e,0x235a
+ .word 0x2345,0x2330,0x231c,0x2307,0x22f2,0x22dd,0x22c9,0x22b4
+ .word 0x22a0,0x228b,0x2276,0x2262,0x224d,0x2239,0x2224,0x2210
+ .word 0x21fb,0x21e6,0x21d2,0x21bd,0x21a9,0x2194,0x2180,0x216c
+ .word 0x2157,0x2143,0x212e,0x211a,0x2105,0x20f1,0x20dd,0x20c8
+ .word 0x20b4,0x20a0,0x208b,0x2077,0x2063,0x204e,0x203a,0x2026
+ .word 0x2012,0x1ffd,0x1fe9,0x1fd5,0x1fc1,0x1fac,0x1f98,0x1f84
+ .word 0x1f70,0x1f5c,0x1f47,0x1f33,0x1f1f,0x1f0b,0x1ef7,0x1ee3
+ .word 0x1ecf,0x1ebb,0x1ea7,0x1e93,0x1e7f,0x1e6a,0x1e56,0x1e42
+ .word 0x1e2e,0x1e1a,0x1e06,0x1df3,0x1ddf,0x1dcb,0x1db7,0x1da3
+ .word 0x1d8f,0x1d7b,0x1d67,0x1d53,0x1d3f,0x1d2b,0x1d18,0x1d04
+ .word 0x1cf0,0x1cdc,0x1cc8,0x1cb5,0x1ca1,0x1c8d,0x1c79,0x1c65
+ .word 0x1c52,0x1c3e,0x1c2a,0x1c17,0x1c03,0x1bef,0x1bdb,0x1bc8
+ .word 0x1bb4,0x1ba0,0x1b8d,0x1b79,0x1b66,0x1b52,0x1b3e,0x1b2b
+ .word 0x1b17,0x1b04,0x1af0,0x1add,0x1ac9,0x1ab6,0x1aa2,0x1a8f
+ .word 0x1a7b,0x1a68,0x1a54,0x1a41,0x1a2d,0x1a1a,0x1a06,0x19f3
+ .word 0x19e0,0x19cc,0x19b9,0x19a5,0x1992,0x197f,0x196b,0x1958
+ .word 0x1945,0x1931,0x191e,0x190b,0x18f8,0x18e4,0x18d1,0x18be
+ .word 0x18ab,0x1897,0x1884,0x1871,0x185e,0x184b,0x1837,0x1824
+ .word 0x1811,0x17fe,0x17eb,0x17d8,0x17c4,0x17b1,0x179e,0x178b
+ .word 0x1778,0x1765,0x1752,0x173f,0x172c,0x1719,0x1706,0x16f3
+ .word 0x16e0,0x16cd,0x16ba,0x16a7,0x1694,0x1681,0x166e,0x165b
+ .word 0x1648,0x1635,0x1623,0x1610,0x15fd,0x15ea,0x15d7,0x15c4
+ .word 0x15b1,0x159f,0x158c,0x1579,0x1566,0x1553,0x1541,0x152e
+ .word 0x151b,0x1508,0x14f6,0x14e3,0x14d0,0x14bd,0x14ab,0x1498
+ .word 0x1485,0x1473,0x1460,0x144d,0x143b,0x1428,0x1416,0x1403
+ .word 0x13f0,0x13de,0x13cb,0x13b9,0x13a6,0x1394,0x1381,0x136f
+ .word 0x135c,0x1349,0x1337,0x1325,0x1312,0x1300,0x12ed,0x12db
+ .word 0x12c8,0x12b6,0x12a3,0x1291,0x127f,0x126c,0x125a,0x1247
+ .word 0x1235,0x1223,0x1210,0x11fe,0x11ec,0x11d9,0x11c7,0x11b5
+ .word 0x11a3,0x1190,0x117e,0x116c,0x1159,0x1147,0x1135,0x1123
+ .word 0x1111,0x10fe,0x10ec,0x10da,0x10c8,0x10b6,0x10a4,0x1091
+ .word 0x107f,0x106d,0x105b,0x1049,0x1037,0x1025,0x1013,0x1001
+ .word 0x0fef,0x0fdc,0x0fca,0x0fb8,0x0fa6,0x0f94,0x0f82,0x0f70
+ .word 0x0f5e,0x0f4c,0x0f3a,0x0f28,0x0f17,0x0f05,0x0ef3,0x0ee1
+ .word 0x0ecf,0x0ebd,0x0eab,0x0e99,0x0e87,0x0e75,0x0e64,0x0e52
+ .word 0x0e40,0x0e2e,0x0e1c,0x0e0a,0x0df9,0x0de7,0x0dd5,0x0dc3
+ .word 0x0db2,0x0da0,0x0d8e,0x0d7c,0x0d6b,0x0d59,0x0d47,0x0d35
+ .word 0x0d24,0x0d12,0x0d00,0x0cef,0x0cdd,0x0ccb,0x0cba,0x0ca8
+ .word 0x0c97,0x0c85,0x0c73,0x0c62,0x0c50,0x0c3f,0x0c2d,0x0c1c
+ .word 0x0c0a,0x0bf8,0x0be7,0x0bd5,0x0bc4,0x0bb2,0x0ba1,0x0b8f
+ .word 0x0b7e,0x0b6c,0x0b5b,0x0b4a,0x0b38,0x0b27,0x0b15,0x0b04
+ .word 0x0af2,0x0ae1,0x0ad0,0x0abe,0x0aad,0x0a9c,0x0a8a,0x0a79
+ .word 0x0a68,0x0a56,0x0a45,0x0a34,0x0a22,0x0a11,0x0a00,0x09ee
+ .word 0x09dd,0x09cc,0x09bb,0x09a9,0x0998,0x0987,0x0976,0x0965
+ .word 0x0953,0x0942,0x0931,0x0920,0x090f,0x08fe,0x08ec,0x08db
+ .word 0x08ca,0x08b9,0x08a8,0x0897,0x0886,0x0875,0x0864,0x0853
+ .word 0x0842,0x0831,0x081f,0x080e,0x07fd,0x07ec,0x07db,0x07ca
+ .word 0x07b9,0x07a8,0x0798,0x0787,0x0776,0x0765,0x0754,0x0743
+ .word 0x0732,0x0721,0x0710,0x06ff,0x06ee,0x06dd,0x06cd,0x06bc
+ .word 0x06ab,0x069a,0x0689,0x0678,0x0668,0x0657,0x0646,0x0635
+ .word 0x0624,0x0614,0x0603,0x05f2,0x05e1,0x05d1,0x05c0,0x05af
+ .word 0x059e,0x058e,0x057d,0x056c,0x055c,0x054b,0x053a,0x052a
+ .word 0x0519,0x0508,0x04f8,0x04e7,0x04d6,0x04c6,0x04b5,0x04a5
+ .word 0x0494,0x0484,0x0473,0x0462,0x0452,0x0441,0x0431,0x0420
+ .word 0x0410,0x03ff,0x03ef,0x03de,0x03ce,0x03bd,0x03ad,0x039c
+ .word 0x038c,0x037b,0x036b,0x035b,0x034a,0x033a,0x0329,0x0319
+ .word 0x0309,0x02f8,0x02e8,0x02d7,0x02c7,0x02b7,0x02a6,0x0296
+ .word 0x0286,0x0275,0x0265,0x0255,0x0245,0x0234,0x0224,0x0214
+ .word 0x0204,0x01f3,0x01e3,0x01d3,0x01c3,0x01b2,0x01a2,0x0192
+ .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111
+ .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090
+ .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010
+DATAEND()
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/lshift.asm b/rts/gmp/mpn/alpha/lshift.asm
new file mode 100644
index 0000000000..87c46f6fe7
--- /dev/null
+++ b/rts/gmp/mpn/alpha/lshift.asm
@@ -0,0 +1,104 @@
+dnl Alpha mpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl make it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+C Shift the r18-limb vector at r17 left by r19 bits (1 <= cnt <= 63),
+C store the result at r16, and return the bits shifted out of the top
+C limb in r0. Processing runs from the high limb down to the low limb,
+C which makes the routine safe when the regions overlap with res >= s1.
+C Register roles: r4 = limb carried between steps, r7 = -cnt. Alpha
+C shift instructions use only the low 6 bits of the count, so shifting
+C by r7 behaves as shifting by 64-cnt.
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r17,8,r17
+ subq r31,r19,r7 C r7 = -cnt, acts as 64-cnt in srl below
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18
+ and r18,4-1,r20 C number of limbs in first loop
+ srl r4,r7,r0 C compute function result
+
+ beq r20,$L0
+ subq r18,r20,r18
+
+ ALIGN(8)
+C Pre-loop: one limb per iteration until the remaining count is a
+C multiple of 4, so the unrolled loop below needs no tail handling.
+$Loop0:
+ ldq r3,-8(r17)
+ subq r16,8,r16
+ subq r17,8,r17
+ subq r20,1,r20
+ sll r4,r19,r5 C high part comes from the current limb
+ srl r3,r7,r6 C low part comes from the next lower limb
+ bis r3,r3,r4 C move next limb into the carry slot
+ bis r5,r6,r8 C combine the two parts into a result limb
+ stq r8,0(r16)
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend
+
+ ALIGN(8)
+C Main loop, 4 limbs per iteration, alternating r3/r4 as the limb that
+C is carried from one step into the next.
+$Loop: ldq r3,-8(r17)
+ subq r16,32,r16
+ subq r18,4,r18
+ sll r4,r19,r5
+ srl r3,r7,r6
+
+ ldq r4,-16(r17)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,24(r16)
+ srl r4,r7,r2
+
+ ldq r3,-24(r17)
+ sll r4,r19,r5
+ bis r1,r2,r8
+ stq r8,16(r16)
+ srl r3,r7,r6
+
+ ldq r4,-32(r17)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,8(r16)
+ srl r4,r7,r2
+
+ subq r17,32,r17
+ bis r1,r2,r8
+ stq r8,0(r16)
+
+ bgt r18,$Loop
+
+C Final limb: only the shifted-up part of the lowest source limb.
+$Lend: sll r4,r19,r8
+ stq r8,-8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/mul_1.asm b/rts/gmp/mpn/alpha/mul_1.asm
new file mode 100644
index 0000000000..46b8df34f5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/mul_1.asm
@@ -0,0 +1,71 @@
+dnl Alpha __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+C Multiply the r18-limb vector at r17 by the single limb r19, store the
+C low limbs of the products at r16, and return the final carry limb in
+C r0. Requires size >= 1.
+C Loop register roles: r2 = current source limb, r3 = low half of the
+C current product, r4 = high half of the previous product, r0 = carry
+C propagated into the next limb.
+ ldq r2,0(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ bic r31,r31,r4 C clear cy_limb
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,8(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ stq r3,0(r16)
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+C Main loop. The carry out of each step is at most one bit (from the
+C low-limb addition) and is folded into the next high half via r0.
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,16(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,8(r16)
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r16,8,r16 C res_ptr++
+ bne r18,$Loop
+
+C Wind-down: fold the last pending carry into the final high limb.
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,8(r16)
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: stq r3,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/rshift.asm b/rts/gmp/mpn/alpha/rshift.asm
new file mode 100644
index 0000000000..aa25eda54e
--- /dev/null
+++ b/rts/gmp/mpn/alpha/rshift.asm
@@ -0,0 +1,102 @@
+dnl Alpha mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl make it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
<br>
+PROLOGUE(mpn_rshift)
+C Shift the r18-limb vector at r17 right by r19 bits (1 <= cnt <= 63),
+C store the result at r16, and return the bits shifted out of the low
+C limb in r0. Processing runs from the low limb up to the high limb,
+C which makes the routine safe when the regions overlap with res <= s1.
+C Register roles: r4 = limb carried between steps, r7 = -cnt. Alpha
+C shift instructions use only the low 6 bits of the count, so shifting
+C by r7 behaves as shifting by 64-cnt.
+ ldq r4,0(r17) C load first limb
+ addq r17,8,r17
+ subq r31,r19,r7 C r7 = -cnt, acts as 64-cnt in sll below
+ subq r18,1,r18
+ and r18,4-1,r20 C number of limbs in first loop
+ sll r4,r7,r0 C compute function result
+
+ beq r20,$L0
+ subq r18,r20,r18
+
+ ALIGN(8)
+C Pre-loop: one limb per iteration until the remaining count is a
+C multiple of 4, so the unrolled loop below needs no tail handling.
+$Loop0:
+ ldq r3,0(r17)
+ addq r16,8,r16
+ addq r17,8,r17
+ subq r20,1,r20
+ srl r4,r19,r5 C low part comes from the current limb
+ sll r3,r7,r6 C high part comes from the next higher limb
+ bis r3,r3,r4 C move next limb into the carry slot
+ bis r5,r6,r8 C combine the two parts into a result limb
+ stq r8,-8(r16)
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend
+
+ ALIGN(8)
+C Main loop, 4 limbs per iteration, alternating r3/r4 as the limb that
+C is carried from one step into the next.
+$Loop: ldq r3,0(r17)
+ addq r16,32,r16
+ subq r18,4,r18
+ srl r4,r19,r5
+ sll r3,r7,r6
+
+ ldq r4,8(r17)
+ srl r3,r19,r1
+ bis r5,r6,r8
+ stq r8,-32(r16)
+ sll r4,r7,r2
+
+ ldq r3,16(r17)
+ srl r4,r19,r5
+ bis r1,r2,r8
+ stq r8,-24(r16)
+ sll r3,r7,r6
+
+ ldq r4,24(r17)
+ srl r3,r19,r1
+ bis r5,r6,r8
+ stq r8,-16(r16)
+ sll r4,r7,r2
+
+ addq r17,32,r17
+ bis r1,r2,r8
+ stq r8,-8(r16)
+
+ bgt r18,$Loop
+
+C Final limb: only the shifted-down part of the highest source limb.
+$Lend: srl r4,r19,r8
+ stq r8,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/sub_n.asm b/rts/gmp/mpn/alpha/sub_n.asm
new file mode 100644
index 0000000000..718f657141
--- /dev/null
+++ b/rts/gmp/mpn/alpha/sub_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+C res[0..size-1] = s1[0..size-1] - s2[0..size-1], size > 0.
+C Returns the final borrow (0 or 1) in r0.
+C Borrow propagation trick used throughout: the incoming borrow r0 is
+C first added to the s2 limb. r1 catches the carry out of that add
+C (possible only when the s2 limb was all ones, in which case the subq
+C below subtracts zero), and the cmpult after the subq catches the
+C borrow from the subtraction itself. The two flags can not both be 1,
+C so a bis merges them into the outgoing borrow.
+ ldq r3,0(r17)
+ ldq r4,0(r18)
+
+ subq r19,1,r19
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0 C clear the initial borrow
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19
+
+C Pre-loop: one limb per iteration, with the next pair of source limbs
+C loaded early (r5/r6) and moved into place (r3/r4) at the bottom.
+$Loop0: subq r2,1,r2
+ ldq r5,8(r17)
+ addq r4,r0,r4 C fold incoming borrow into s2 limb
+ ldq r6,8(r18)
+ cmpult r4,r0,r1 C r1 = carry out of that fold
+ subq r3,r4,r4 C r4 = difference
+ cmpult r3,r4,r0 C r0 = borrow from the subtraction
+ stq r4,0(r16)
+ bis r0,r1,r0 C merge into the outgoing borrow
+
+ addq r17,8,r17
+ addq r18,8,r18
+ bis r5,r5,r3
+ bis r6,r6,r4
+ addq r16,8,r16
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend
+
+ ALIGN(8)
+C Main loop, 4 limbs per iteration, alternating r3/r4 and r5/r6 as the
+C current pair of source limbs so loads overlap the borrow arithmetic.
+$Loop: subq r19,4,r19
+
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17)
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+C Final limb (already loaded in r3/r4 by the loop above).
+$Lend: addq r4,r0,r4
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/submul_1.asm b/rts/gmp/mpn/alpha/submul_1.asm
new file mode 100644
index 0000000000..caec1a720b
--- /dev/null
+++ b/rts/gmp/mpn/alpha/submul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha __gmpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+C res[0..size-1] -= s1[0..size-1] * s2_limb, size >= 1.
+C Returns the carry-out limb in r0, i.e. the amount that would have to
+C be subtracted from res[size] to complete the operation.
+C Loop register roles: r2 = current source limb, r3 = low half of the
+C current product, r4 = high half of the previous product (and later
+C the borrow from the res update), r5 = current res limb, r0 = carry
+C propagated into the next limb.
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ subq r5,r3,r3 C r3 = *res_ptr - prod_low
+ cmpult r5,r3,r4 C r4 = borrow from that subtraction
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+C Main loop. Both the low-limb add carry (r0) and the res-subtraction
+C borrow (r5) feed the carry into the next limb via the final addq.
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3 C subtract adjusted product from res limb
+ cmpult r5,r3,r5 C r5 = borrow from that subtraction
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+C Wind-down: same step as the loop, then fold all pending carries into
+C the returned carry limb.
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r0,r5,r0 C return prod_high + borrow
+ ret r31,(r26),1
+EPILOGUE(mpn_submul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/udiv_qrnnd.S b/rts/gmp/mpn/alpha/udiv_qrnnd.S
new file mode 100644
index 0000000000..53814bbcb0
--- /dev/null
+++ b/rts/gmp/mpn/alpha/udiv_qrnnd.S
@@ -0,0 +1,151 @@
+ # Alpha 21064 __udiv_qrnnd
+
+ # Copyright (C) 1992, 1994, 1995, 1997, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __gmpn_udiv_qrnnd
+ .ent __gmpn_udiv_qrnnd
+ # __gmpn_udiv_qrnnd(rem_ptr, n1, n0, d)
+ # Divide the 128-bit value n1:n0 (n1 = high half) by d.  The quotient
+ # is returned in $0 and the remainder is stored at 0(rem_ptr).
+ # NOTE(review): callers are presumably required to ensure n1 < d so
+ # the quotient fits in one limb (the usual udiv_qrnnd contract) --
+ # confirm against longlong.h.
+ # Method: restoring shift-and-subtract division, one quotient bit per
+ # step, four steps unrolled per loop pass, cnt = 16 passes = 64 bits.
+__gmpn_udiv_qrnnd:
+ .frame $30,0,$26,0
+ .prologue 0
+#define cnt $2
+#define tmp $3
+#define rem_ptr $16
+#define n1 $17
+#define n0 $18
+#define d $19
+#define qb $20
+
+ ldiq cnt,16
+ blt d,.Largedivisor # top bit of d set: take the halved path
+
+ # One division step: shift n1:n0 left one bit (top bit of n0 moves
+ # into n1), compute the quotient bit qb = (d <= n1), subtract d from
+ # n1 when qb is set, and shift qb into the bottom of n0.  n0 thereby
+ # accumulates the quotient while the numerator shifts out of it.
+.Loop1: cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ subq cnt,1,cnt
+ bgt cnt,.Loop1
+ stq n1,0(rem_ptr) # remainder ends up in n1
+ bis $31,n0,$0 # quotient accumulated in n0
+ ret $31,($26),1
+
+ # d >= 2^63: the unsigned compare in the step above would be broken
+ # by the shifted n1 wrapping, so divide floor(n/2) by ceil(d/2) and
+ # fix the result up afterwards.
+.Largedivisor:
+ and n0,1,$4 # $4 = saved low bit of n0
+
+ srl n0,1,n0 # n1:n0 = floor(original n / 2)
+ sll n1,63,tmp
+ or tmp,n0,n0
+ srl n1,1,n1
+
+ and d,1,$6 # $6 = d & 1
+ srl d,1,$5
+ addq $5,$6,$5 # $5 = ceil(d/2)
+
+ # Same step as .Loop1 but dividing by $5 = ceil(d/2).
+.Loop2: cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ subq cnt,1,cnt
+ bgt cnt,.Loop2
+
+ addq n1,n1,n1 # true remainder candidate = 2*r + saved bit
+ addq $4,n1,n1
+ bne $6,.LOdd # odd d needs a further adjustment
+ stq n1,0(rem_ptr) # even d: remainder and quotient are exact
+ bis $31,n0,$0
+ ret $31,($26),1
+
+ # Odd d: the loop divided by (d+1)/2, so correct quotient and
+ # remainder with at most two conditional adjustment steps.
+.LOdd:
+ /* q' in n0. r' in n1 */
+ addq n1,n0,n1
+ cmpult n1,n0,tmp # tmp := carry from addq
+ beq tmp,.LLp6
+ addq n0,1,n0
+ subq n1,d,n1
+.LLp6: cmpult n1,d,tmp
+ bne tmp,.LLp7
+ addq n0,1,n0
+ subq n1,d,n1
+.LLp7:
+ stq n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+ .end __gmpn_udiv_qrnnd
diff --git a/rts/gmp/mpn/alpha/umul.asm b/rts/gmp/mpn/alpha/umul.asm
new file mode 100644
index 0000000000..44428ed5f5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/umul.asm
@@ -0,0 +1,39 @@
+dnl Currently unused.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ .set noreorder
+ .set volatile
+ .set noat
+
+dnl  __umul_ppmm: full 64x64 -> 128 bit unsigned multiply.
+dnl  In:  $16 = pointer for the low half, $17 and $18 = multiplicands.
+dnl  Out: low 64 bits of the product stored at 0($16), high 64 bits
+dnl  returned in $0.
+dnl  Leaf routine: no stack frame, clobbers only $0 and $1.
+.text
+ .align 3
+ .globl __umul_ppmm
+ .ent __umul_ppmm
+__umul_ppmm:
+__umul_ppmm..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ mulq $17,$18,$1
+ umulh $17,$18,$0
+ stq $1,0($16)
+ ret $31,($26),1
+ .end __umul_ppmm
diff --git a/rts/gmp/mpn/alpha/unicos.m4 b/rts/gmp/mpn/alpha/unicos.m4
new file mode 100644
index 0000000000..7ff26c090c
--- /dev/null
+++ b/rts/gmp/mpn/alpha/unicos.m4
@@ -0,0 +1,63 @@
+divert(-1)
+
+dnl  Assembler syntax glue for the Cray Unicos/Alpha native assembler,
+dnl  whose directives differ from those of the DEC and GNU assemblers
+dnl  handled by default.m4.  Everything up to the closing divert is
+dnl  definition-only and produces no output.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB.  If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl  ASM_START: emitted once at the top of each generated file.
+define(`ASM_START',
+ `.ident dummy')
+
+dnl  X: hexadecimal constant prefix.  Unicos writes ^X1234 where other
+dnl  assemblers write 0x1234.
+define(`X',`^X$1')
+dnl  FLOAT64(label,value): a 64-bit floating point datum in a data psect.
+define(`FLOAT64',
+ `dnl
+ .psect $1@crud,data
+$1: .t_floating $2
+ .endp')
+
+dnl  PROLOGUE(name): open a code psect and define the entry label.
+define(`PROLOGUE',
+ `dnl
+ .stack 192 ; What does this mean? Only Cray knows.
+ .psect $1@code,code,cache
+$1::')
+dnl  PROLOGUE_GP: no gp setup is needed here, so it is plain PROLOGUE.
+define(`PROLOGUE_GP', `PROLOGUE($1)')
+
+dnl  EPILOGUE(name): close the psect opened by PROLOGUE.
+define(`EPILOGUE',
+ `dnl
+ .endp')
+
+dnl  DATASTART(name) / DATAEND: bracket a data psect.
+define(`DATASTART',
+ `dnl
+ .psect $1@crud,data
+$1:')
+define(`DATAEND',
+ `dnl
+ .endp')
+
+dnl  ASM_END: emitted once at the bottom of each generated file.
+define(`ASM_END',
+ `dnl
+ .end')
+
+define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop
+define(`cvttqc',`cvttq/c')
+
+define(`ALIGN',`') ; Unicos assembler seems to align using garbage
+
+divert
+