summaryrefslogtreecommitdiff
path: root/ghc/rts/gmp/mpn
diff options
context:
space:
mode:
authorsimonm <unknown>1998-06-05 14:43:46 +0000
committersimonm <unknown>1998-06-05 14:43:46 +0000
commit7d11a7c78fa988c079a43b092284f255c3788a10 (patch)
tree1ab23dc8bd0e17efe3ad81afcc26a756bcbf63c2 /ghc/rts/gmp/mpn
parenta343265d8f6bf27a84bb16f68f9242b346dc5095 (diff)
downloadhaskell-7d11a7c78fa988c079a43b092284f255c3788a10.tar.gz
[project @ 1998-06-05 14:43:45 by simonm]
Initial revision
Diffstat (limited to 'ghc/rts/gmp/mpn')
-rw-r--r--ghc/rts/gmp/mpn/alpha/add_n.s120
-rw-r--r--ghc/rts/gmp/mpn/alpha/addmul_1.s92
-rw-r--r--ghc/rts/gmp/mpn/alpha/ev5/add_n.s148
-rw-r--r--ghc/rts/gmp/mpn/alpha/ev5/lshift.s174
-rw-r--r--ghc/rts/gmp/mpn/alpha/ev5/rshift.s172
-rw-r--r--ghc/rts/gmp/mpn/alpha/ev5/sub_n.s149
-rw-r--r--ghc/rts/gmp/mpn/alpha/lshift.s109
-rw-r--r--ghc/rts/gmp/mpn/alpha/mul_1.s85
-rw-r--r--ghc/rts/gmp/mpn/alpha/rshift.s107
-rw-r--r--ghc/rts/gmp/mpn/alpha/sub_n.s120
-rw-r--r--ghc/rts/gmp/mpn/alpha/submul_1.s92
11 files changed, 1368 insertions, 0 deletions
diff --git a/ghc/rts/gmp/mpn/alpha/add_n.s b/ghc/rts/gmp/mpn/alpha/add_n.s
new file mode 100644
index 0000000000..426556e398
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/add_n.s
@@ -0,0 +1,120 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ ldq $3,0($17)
+ ldq $4,0($18)
+
+ subq $19,1,$19
+ and $19,4-1,$2 # number of limbs in first loop
+ bis $31,$31,$0
+ beq $2,.L0 # if multiple of 4 limbs, skip first loop
+
+ subq $19,$2,$19
+
+.Loop0: subq $2,1,$2
+ ldq $5,8($17)
+ addq $4,$0,$4
+ ldq $6,8($18)
+ cmpult $4,$0,$1
+ addq $3,$4,$4
+ cmpult $4,$3,$0
+ stq $4,0($16)
+ or $0,$1,$0
+
+ addq $17,8,$17
+ addq $18,8,$18
+ bis $5,$5,$3
+ bis $6,$6,$4
+ addq $16,8,$16
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend
+
+ .align 3
+.Loop: subq $19,4,$19
+
+ ldq $5,8($17)
+ addq $4,$0,$4
+ ldq $6,8($18)
+ cmpult $4,$0,$1
+ addq $3,$4,$4
+ cmpult $4,$3,$0
+ stq $4,0($16)
+ or $0,$1,$0
+
+ ldq $3,16($17)
+ addq $6,$0,$6
+ ldq $4,16($18)
+ cmpult $6,$0,$1
+ addq $5,$6,$6
+ cmpult $6,$5,$0
+ stq $6,8($16)
+ or $0,$1,$0
+
+ ldq $5,24($17)
+ addq $4,$0,$4
+ ldq $6,24($18)
+ cmpult $4,$0,$1
+ addq $3,$4,$4
+ cmpult $4,$3,$0
+ stq $4,16($16)
+ or $0,$1,$0
+
+ ldq $3,32($17)
+ addq $6,$0,$6
+ ldq $4,32($18)
+ cmpult $6,$0,$1
+ addq $5,$6,$6
+ cmpult $6,$5,$0
+ stq $6,24($16)
+ or $0,$1,$0
+
+ addq $17,32,$17
+ addq $18,32,$18
+ addq $16,32,$16
+ bne $19,.Loop
+
+.Lend: addq $4,$0,$4
+ cmpult $4,$0,$1
+ addq $3,$4,$4
+ cmpult $4,$3,$0
+ stq $4,0($16)
+ or $0,$1,$0
+ ret $31,($26),1
+
+ .end __mpn_add_n
diff --git a/ghc/rts/gmp/mpn/alpha/addmul_1.s b/ghc/rts/gmp/mpn/alpha/addmul_1.s
new file mode 100644
index 0000000000..048238ae9d
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/addmul_1.s
@@ -0,0 +1,92 @@
+ # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1 2
+__mpn_addmul_1:
+ .frame $30,0,$26
+
+ ldq $2,0($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ subq $18,1,$18 # size--
+ mulq $2,$19,$3 # $3 = prod_low
+ ldq $5,0($16) # $5 = *res_ptr
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,.Lend1 # jump if size was == 1
+ ldq $2,0($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ subq $18,1,$18 # size--
+ addq $5,$3,$3
+ cmpult $3,$5,$4
+ stq $3,0($16)
+ addq $16,8,$16 # res_ptr++
+ beq $18,.Lend2 # jump if size was == 2
+
+ .align 3
+.Loop: mulq $2,$19,$3 # $3 = prod_low
+ ldq $5,0($16) # $5 = *res_ptr
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subq $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldq $2,0($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addq $5,$3,$3
+ cmpult $3,$5,$5
+ stq $3,0($16)
+ addq $16,8,$16 # res_ptr++
+ addq $5,$0,$0 # combine carries
+ bne $18,.Loop
+
+.Lend2: mulq $2,$19,$3 # $3 = prod_low
+ ldq $5,0($16) # $5 = *res_ptr
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addq $5,$3,$3
+ cmpult $3,$5,$5
+ stq $3,0($16)
+ addq $5,$0,$0 # combine carries
+ addq $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+.Lend1: addq $5,$3,$3
+ cmpult $3,$5,$5
+ stq $3,0($16)
+ addq $0,$5,$0
+ ret $31,($26),1
+
+ .end __mpn_addmul_1
diff --git a/ghc/rts/gmp/mpn/alpha/ev5/add_n.s b/ghc/rts/gmp/mpn/alpha/ev5/add_n.s
new file mode 100644
index 0000000000..1251a1fb71
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/ev5/add_n.s
@@ -0,0 +1,148 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subq $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldq $0,0($18)
+ ldq $1,8($18)
+ ldq $4,0($17)
+ ldq $5,8($17)
+ addq $17,32,$17 # update s1_ptr
+ ldq $2,16($18)
+ addq $0,$4,$20 # 1st main add
+ ldq $3,24($18)
+ subq $19,4,$19 # decr loop cnt
+ ldq $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldq $7,-8($17)
+ addq $1,$25,$28 # cy add
+ addq $18,32,$18 # update s2_ptr
+ addq $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldq $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldq $1,8($18)
+ addq $2,$25,$28 # cy add
+ ldq $4,0($17)
+ addq $28,$6,$22 # 3rd main add
+ ldq $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stq $21,8($16)
+ addq $3,$25,$28 # cy add
+ addq $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addq $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,32,$16 # update res_ptr
+ addq $0,$25,$28 # cy add
+ ldq $2,16($18)
+ addq $4,$28,$20 # 1st main add
+ ldq $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldq $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldq $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subq $19,4,$19 # decr loop cnt
+ stq $22,-16($16)
+ addq $1,$25,$28 # cy add
+ stq $23,-8($16)
+ addq $5,$28,$21 # 2nd main add
+ addq $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addq $2,$25,$28 # cy add
+ addq $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stq $21,8($16)
+ addq $3,$25,$28 # cy add
+ addq $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,32,$16 # update res_ptr
+ stq $22,-16($16)
+ stq $23,-8($16)
+.Lend2: addq $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldq $0,0($18)
+ ldq $4,0($17)
+ subq $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addq $0,$25,$28 # cy add
+ ldq $0,8($18)
+ addq $4,$28,$20 # main add
+ ldq $4,8($17)
+ addq $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addq $17,8,$17
+ stq $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subq $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,8,$16
+ bne $19,.Loop0
+.Lend0: addq $0,$25,$28 # cy add
+ addq $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
diff --git a/ghc/rts/gmp/mpn/alpha/ev5/lshift.s b/ghc/rts/gmp/mpn/alpha/ev5/lshift.s
new file mode 100644
index 0000000000..ced55b7203
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/ev5/lshift.s
@@ -0,0 +1,174 @@
+ # Alpha EV5 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the EV5.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addq $18,$17,$17 # make r17 point at end of s1
+ ldq $4,-8($17) # load first limb
+ subq $31,$19,$20
+ s8addq $18,$16,$16 # make r16 point at end of RES
+ subq $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subq $18,$28,$18
+
+ .align 3
+.Loop0: ldq $3,-16($17)
+ subq $16,8,$16
+ sll $4,$19,$5
+ subq $17,8,$17
+ subq $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stq $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldq $1,-16($17)
+ subq $18,4,$18
+ ldq $2,-24($17)
+ ldq $3,-32($17)
+ ldq $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldq $1,-48($17)
+ sll $2,$19,$22
+ ldq $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldq $3,-64($17)
+ sll $4,$19,$24
+ ldq $4,-72($17)
+ subq $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stq $7,-8($16)
+ or $5,$22,$5
+ stq $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subq $18,4,$18
+ sll $1,$19,$21
+ unop # ldq $31,-96($17)
+
+ srl $2,$20,$8
+ ldq $1,-80($17)
+ sll $2,$19,$22
+ ldq $2,-88($17)
+
+ stq $5,-24($16)
+ or $7,$24,$7
+ stq $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldq $31,-96($17)
+ sll $3,$19,$23
+ subq $16,32,$16
+
+ srl $4,$20,$6
+ ldq $3,-96($17)
+ sll $4,$19,$24
+ ldq $4,-104($17)
+
+ subq $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stq $7,-8($16)
+ or $5,$22,$5
+ stq $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stq $5,-24($16)
+ or $7,$24,$7
+ stq $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stq $7,-40($16)
+ or $5,$22,$5
+ stq $8,-48($16)
+ or $6,$23,$6
+ stq $5,-56($16)
+ stq $6,-64($16)
+ # cool down phase 2/3
+ stq $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stq $7,-8($16)
+ or $5,$22,$5
+ stq $8,-16($16)
+ or $6,$23,$6
+ stq $5,-24($16)
+ stq $6,-32($16)
+ stq $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stq $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
diff --git a/ghc/rts/gmp/mpn/alpha/ev5/rshift.s b/ghc/rts/gmp/mpn/alpha/ev5/rshift.s
new file mode 100644
index 0000000000..6e24fef965
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/ev5/rshift.s
@@ -0,0 +1,172 @@
+ # Alpha EV5 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the EV5.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldq $4,0($17) # load first limb
+ subq $31,$19,$20
+ subq $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subq $18,$28,$18
+
+ .align 3
+.Loop0: ldq $3,8($17)
+ addq $16,8,$16
+ srl $4,$19,$5
+ addq $17,8,$17
+ subq $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stq $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldq $1,8($17)
+ subq $18,4,$18
+ ldq $2,16($17)
+ ldq $3,24($17)
+ ldq $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldq $1,40($17)
+ srl $2,$19,$22
+ ldq $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldq $3,56($17)
+ srl $4,$19,$24
+ ldq $4,64($17)
+ subq $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stq $7,0($16)
+ or $5,$22,$5
+ stq $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subq $18,4,$18
+ srl $1,$19,$21
+ unop # ldq $31,-96($17)
+
+ sll $2,$20,$8
+ ldq $1,72($17)
+ srl $2,$19,$22
+ ldq $2,80($17)
+
+ stq $5,16($16)
+ or $7,$24,$7
+ stq $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldq $31,-96($17)
+ srl $3,$19,$23
+ addq $16,32,$16
+
+ sll $4,$20,$6
+ ldq $3,88($17)
+ srl $4,$19,$24
+ ldq $4,96($17)
+
+ addq $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stq $7,0($16)
+ or $5,$22,$5
+ stq $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stq $5,16($16)
+ or $7,$24,$7
+ stq $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stq $7,32($16)
+ or $5,$22,$5
+ stq $8,40($16)
+ or $6,$23,$6
+ stq $5,48($16)
+ stq $6,56($16)
+ # cool down phase 2/3
+ stq $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stq $7,0($16)
+ or $5,$22,$5
+ stq $8,8($16)
+ or $6,$23,$6
+ stq $5,16($16)
+ stq $6,24($16)
+ stq $24,32($16)
+ ret $31,($26),1
+
+.Lend: stq $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/ghc/rts/gmp/mpn/alpha/ev5/sub_n.s b/ghc/rts/gmp/mpn/alpha/ev5/sub_n.s
new file mode 100644
index 0000000000..6743af50b8
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/ev5/sub_n.s
@@ -0,0 +1,149 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subq $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldq $0,0($18)
+ ldq $1,8($18)
+ ldq $4,0($17)
+ ldq $5,8($17)
+ addq $17,32,$17 # update s1_ptr
+ ldq $2,16($18)
+ subq $4,$0,$20 # 1st main sub
+ ldq $3,24($18)
+ subq $19,4,$19 # decr loop cnt
+ ldq $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldq $7,-8($17)
+ addq $1,$25,$28 # cy add
+ addq $18,32,$18 # update s2_ptr
+ subq $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldq $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldq $1,8($18)
+ addq $2,$25,$28 # cy add
+ ldq $4,0($17)
+ subq $6,$28,$22 # 3rd main sub
+ ldq $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stq $21,8($16)
+ addq $3,$25,$28 # cy add
+ subq $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addq $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,32,$16 # update res_ptr
+ addq $0,$25,$28 # cy add
+ ldq $2,16($18)
+ subq $4,$28,$20 # 1st main sub
+ ldq $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldq $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldq $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subq $19,4,$19 # decr loop cnt
+ stq $22,-16($16)
+ addq $1,$25,$28 # cy add
+ stq $23,-8($16)
+ subq $5,$28,$21 # 2nd main sub
+ addq $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addq $2,$25,$28 # cy add
+ subq $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stq $21,8($16)
+ addq $3,$25,$28 # cy add
+ subq $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,32,$16 # update res_ptr
+ stq $22,-16($16)
+ stq $23,-8($16)
+.Lend2: addq $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldq $0,0($18)
+ ldq $4,0($17)
+ subq $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addq $0,$25,$28 # cy add
+ ldq $0,8($18)
+ subq $4,$28,$20 # main sub
+ ldq $1,8($17)
+ addq $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addq $17,8,$17
+ stq $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subq $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addq $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addq $0,$25,$28 # cy add
+ subq $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stq $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
diff --git a/ghc/rts/gmp/mpn/alpha/lshift.s b/ghc/rts/gmp/mpn/alpha/lshift.s
new file mode 100644
index 0000000000..13bd24a427
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/lshift.s
@@ -0,0 +1,109 @@
+ # Alpha 21064 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions. But there are many restrictions in the 21064 pipeline that
+ # makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addq $18,$17,$17 # make r17 point at end of s1
+ ldq $4,-8($17) # load first limb
+ subq $17,8,$17
+ subq $31,$19,$7
+ s8addq $18,$16,$16 # make r16 point at end of RES
+ subq $18,1,$18
+ and $18,4-1,$20 # number of limbs in first loop
+ srl $4,$7,$0 # compute function result
+
+ beq $20,.L0
+ subq $18,$20,$18
+
+ .align 3
+.Loop0:
+ ldq $3,-8($17)
+ subq $16,8,$16
+ subq $17,8,$17
+ subq $20,1,$20
+ sll $4,$19,$5
+ srl $3,$7,$6
+ bis $3,$3,$4
+ bis $5,$6,$8
+ stq $8,0($16)
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend
+
+ .align 3
+.Loop: ldq $3,-8($17)
+ subq $16,32,$16
+ subq $18,4,$18
+ sll $4,$19,$5
+ srl $3,$7,$6
+
+ ldq $4,-16($17)
+ sll $3,$19,$1
+ bis $5,$6,$8
+ stq $8,24($16)
+ srl $4,$7,$2
+
+ ldq $3,-24($17)
+ sll $4,$19,$5
+ bis $1,$2,$8
+ stq $8,16($16)
+ srl $3,$7,$6
+
+ ldq $4,-32($17)
+ sll $3,$19,$1
+ bis $5,$6,$8
+ stq $8,8($16)
+ srl $4,$7,$2
+
+ subq $17,32,$17
+ bis $1,$2,$8
+ stq $8,0($16)
+
+ bgt $18,.Loop
+
+.Lend: sll $4,$19,$8
+ stq $8,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
diff --git a/ghc/rts/gmp/mpn/alpha/mul_1.s b/ghc/rts/gmp/mpn/alpha/mul_1.s
new file mode 100644
index 0000000000..a1f5a94b9e
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/mul_1.s
@@ -0,0 +1,85 @@
+ # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+ # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture. 2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR. Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_mul_1
+ .ent __mpn_mul_1 2
+__mpn_mul_1:
+ .frame $30,0,$26
+
+ ldq $2,0($17) # $2 = s1_limb
+ subq $18,1,$18 # size--
+ mulq $2,$19,$3 # $3 = prod_low
+ bic $31,$31,$4 # clear cy_limb
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,Lend1 # jump if size was == 1
+ ldq $2,8($17) # $2 = s1_limb
+ subq $18,1,$18 # size--
+ stq $3,0($16)
+ beq $18,Lend2 # jump if size was == 2
+
+ .align 3
+Loop: mulq $2,$19,$3 # $3 = prod_low
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subq $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldq $2,16($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ stq $3,8($16)
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addq $16,8,$16 # res_ptr++
+ bne $18,Loop
+
+Lend2: mulq $2,$19,$3 # $3 = prod_low
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ stq $3,8($16)
+ addq $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+Lend1: stq $3,0($16)
+ ret $31,($26),1
+
+ .end __mpn_mul_1
diff --git a/ghc/rts/gmp/mpn/alpha/rshift.s b/ghc/rts/gmp/mpn/alpha/rshift.s
new file mode 100644
index 0000000000..389054ab0e
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/rshift.s
@@ -0,0 +1,107 @@
+ # Alpha 21064 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions. But there are many restrictions in the 21064 pipeline that
+ # makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldq $4,0($17) # load first limb
+ addq $17,8,$17
+ subq $31,$19,$7
+ subq $18,1,$18
+ and $18,4-1,$20 # number of limbs in first loop
+ sll $4,$7,$0 # compute function result
+
+ beq $20,.L0
+ subq $18,$20,$18
+
+ .align 3
+.Loop0:
+ ldq $3,0($17)
+ addq $16,8,$16
+ addq $17,8,$17
+ subq $20,1,$20
+ srl $4,$19,$5
+ sll $3,$7,$6
+ bis $3,$3,$4
+ bis $5,$6,$8
+ stq $8,-8($16)
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend
+
+ .align 3
+.Loop: ldq $3,0($17)
+ addq $16,32,$16
+ subq $18,4,$18
+ srl $4,$19,$5
+ sll $3,$7,$6
+
+ ldq $4,8($17)
+ srl $3,$19,$1
+ bis $5,$6,$8
+ stq $8,-32($16)
+ sll $4,$7,$2
+
+ ldq $3,16($17)
+ srl $4,$19,$5
+ bis $1,$2,$8
+ stq $8,-24($16)
+ sll $3,$7,$6
+
+ ldq $4,24($17)
+ srl $3,$19,$1
+ bis $5,$6,$8
+ stq $8,-16($16)
+ sll $4,$7,$2
+
+ addq $17,32,$17
+ bis $1,$2,$8
+ stq $8,-8($16)
+
+ bgt $18,.Loop
+
+.Lend: srl $4,$19,$8
+ stq $8,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/ghc/rts/gmp/mpn/alpha/sub_n.s b/ghc/rts/gmp/mpn/alpha/sub_n.s
new file mode 100644
index 0000000000..3c90c11697
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/sub_n.s
@@ -0,0 +1,120 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ ldq $3,0($17)
+ ldq $4,0($18)
+
+ subq $19,1,$19
+ and $19,4-1,$2 # number of limbs in first loop
+ bis $31,$31,$0
+ beq $2,.L0 # if multiple of 4 limbs, skip first loop
+
+ subq $19,$2,$19
+
+.Loop0: subq $2,1,$2
+ ldq $5,8($17)
+ addq $4,$0,$4
+ ldq $6,8($18)
+ cmpult $4,$0,$1
+ subq $3,$4,$4
+ cmpult $3,$4,$0
+ stq $4,0($16)
+ or $0,$1,$0
+
+ addq $17,8,$17
+ addq $18,8,$18
+ bis $5,$5,$3
+ bis $6,$6,$4
+ addq $16,8,$16
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend
+
+ .align 3
+.Loop: subq $19,4,$19
+
+ ldq $5,8($17)
+ addq $4,$0,$4
+ ldq $6,8($18)
+ cmpult $4,$0,$1
+ subq $3,$4,$4
+ cmpult $3,$4,$0
+ stq $4,0($16)
+ or $0,$1,$0
+
+ ldq $3,16($17)
+ addq $6,$0,$6
+ ldq $4,16($18)
+ cmpult $6,$0,$1
+ subq $5,$6,$6
+ cmpult $5,$6,$0
+ stq $6,8($16)
+ or $0,$1,$0
+
+ ldq $5,24($17)
+ addq $4,$0,$4
+ ldq $6,24($18)
+ cmpult $4,$0,$1
+ subq $3,$4,$4
+ cmpult $3,$4,$0
+ stq $4,16($16)
+ or $0,$1,$0
+
+ ldq $3,32($17)
+ addq $6,$0,$6
+ ldq $4,32($18)
+ cmpult $6,$0,$1
+ subq $5,$6,$6
+ cmpult $5,$6,$0
+ stq $6,24($16)
+ or $0,$1,$0
+
+ addq $17,32,$17
+ addq $18,32,$18
+ addq $16,32,$16
+ bne $19,.Loop
+
+.Lend: addq $4,$0,$4
+ cmpult $4,$0,$1
+ subq $3,$4,$4
+ cmpult $3,$4,$0
+ stq $4,0($16)
+ or $0,$1,$0
+ ret $31,($26),1
+
+ .end __mpn_sub_n
diff --git a/ghc/rts/gmp/mpn/alpha/submul_1.s b/ghc/rts/gmp/mpn/alpha/submul_1.s
new file mode 100644
index 0000000000..1ed0c6a8d9
--- /dev/null
+++ b/ghc/rts/gmp/mpn/alpha/submul_1.s
@@ -0,0 +1,92 @@
+ # Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and
+ # subtract the result from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_submul_1
+ .ent __mpn_submul_1 2
+__mpn_submul_1:
+ .frame $30,0,$26
+
+ ldq $2,0($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ subq $18,1,$18 # size--
+ mulq $2,$19,$3 # $3 = prod_low
+ ldq $5,0($16) # $5 = *res_ptr
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,.Lend1 # jump if size was == 1
+ ldq $2,0($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ subq $18,1,$18 # size--
+ subq $5,$3,$3
+ cmpult $5,$3,$4
+ stq $3,0($16)
+ addq $16,8,$16 # res_ptr++
+ beq $18,.Lend2 # jump if size was == 2
+
+ .align 3
+.Loop: mulq $2,$19,$3 # $3 = prod_low
+ ldq $5,0($16) # $5 = *res_ptr
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subq $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldq $2,0($17) # $2 = s1_limb
+ addq $17,8,$17 # s1_ptr++
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ subq $5,$3,$3
+ cmpult $5,$3,$5
+ stq $3,0($16)
+ addq $16,8,$16 # res_ptr++
+ addq $5,$0,$0 # combine carries
+ bne $18,.Loop
+
+.Lend2: mulq $2,$19,$3 # $3 = prod_low
+ ldq $5,0($16) # $5 = *res_ptr
+ addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addq $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ subq $5,$3,$3
+ cmpult $5,$3,$5
+ stq $3,0($16)
+ addq $5,$0,$0 # combine carries
+ addq $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+.Lend1: subq $5,$3,$3
+ cmpult $5,$3,$5
+ stq $3,0($16)
+ addq $0,$5,$0
+ ret $31,($26),1
+
+ .end __mpn_submul_1