Reorganisation of the source tree

Most of the other users of the fptools build system have migrated to Cabal, and with the move to darcs we can now flatten the source tree without losing history, so here goes. The main change is that the ghc/ subdir is gone, and most of what it contained is now at the top level. The build system now makes no pretense at being multi-project, it is just the GHC build system. No doubt this will break many things, and there will be a period of instability while we fix the dependencies. A straightforward build should work, but I haven't yet fixed binary/source distributions. Changes to the Building Guide will follow, too.
author: Simon Marlow <simonmar@microsoft.com> 2006-04-07 02:05:11 +0000
committer: Simon Marlow <simonmar@microsoft.com> 2006-04-07 02:05:11 +0000
commit: 0065d5ab628975892cea1ec7303f968c3338cbe1 (patch)
tree: 8e2afe0ab48ee33cf95009809d67c9649573ef92 /rts/gmp/mpn/pa64
parent: 28a464a75e14cece5db40f2765a29348273ff2d2 (diff)
download: haskell-0065d5ab628975892cea1ec7303f968c3338cbe1.tar.gz
11 files changed, 1166 insertions, 0 deletions
diff --git a/rts/gmp/mpn/pa64/README b/rts/gmp/mpn/pa64/README
new file mode 100644
index 0000000000..8d2976dabc
--- /dev/null
+++ b/rts/gmp/mpn/pa64/README
@@ -0,0 +1,38 @@
+This directory contains mpn functions for 64-bit PA-RISC 2.0.
+
+RELEVANT OPTIMIZATION ISSUES
+
+The PA8000 has a multi-issue pipeline with large buffers for instructions
+awaiting pending results.  Therefore, no latency scheduling is necessary
+(and might actually be harmful).
+
+Two 64-bit loads can be completed per cycle.  One 64-bit store can be
+completed per cycle.  A store cannot complete in the same cycle as a load.
+
+STATUS
+
+* mpn_lshift, mpn_rshift, mpn_add_n, mpn_sub_n are all well-tuned and run at
+  the peak cache bandwidth; 1.5 cycles/limb for shifting and 2.0 cycles/limb
+  for add/subtract.
+
+* The multiplication functions run at 11 cycles/limb.  The cache bandwidth
+  allows 7.5 cycles/limb.  Perhaps it would be possible, using unrolling or
+  better scheduling, to get closer to the cache bandwidth limit.
+
+* xaddmul_1.S contains a quicker method for forming the 128-bit product.  It
+  uses somewhat fewer operations, and keeps the carry flag live across the
+  loop boundary.  But it seems hard to make it run more than 1/4 cycle faster
+  than the old code.  Perhaps we really ought to unroll this loop by 2x?
+  2x should suffice since register latency scheduling is never needed,
+  but the unrolling would hide the store-load latency.  Here is a sketch:
+
+	1. A multiply and store 64-bit products
+	2. B sum 64-bit products into 128-bit product
+	3. B load  64-bit products to integer registers
+	4. B multiply and store 64-bit products
+	5. A sum 64-bit products into 128-bit product
+	6. A load  64-bit products to integer registers
+	7. goto 1
+
+  In practice, adjacent groups (1 and 2, 2 and 3, etc) will be interleaved
+  for better instruction mix.
diff --git a/rts/gmp/mpn/pa64/add_n.s b/rts/gmp/mpn/pa64/add_n.s
new file mode 100644
index 0000000000..22ff19c184
--- /dev/null
+++ b/rts/gmp/mpn/pa64/add_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+; store sum in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; This runs at 2 cycles/limb on PA8000.  The loop body is unrolled 8 times (4 insns per limb);
+; entry branches past (-size & 7) limb groups, so the first pass handles size mod 8 limbs (a full 8 when that is 0).
+	.level	2.0n
+	.code
+	.export	__gmpn_add_n,entry
+__gmpn_add_n
+	.proc
+	.callinfo frame=0,args_saved
+	.entry
+
+	sub		%r0,%r23,%r22		; r22 = -n
+	depw,z		%r22,30,3,%r28		; r28 = 2 * (-n & 7)
+	depw,z		%r22,28,3,%r22		; r22 = 8 * (-n & 7)
+	sub		%r25,%r22,%r25		; offset s1_ptr
+	sub		%r24,%r22,%r24		; offset s2_ptr
+	sub		%r26,%r22,%r26		; offset res_ptr
+	blr		%r28,%r0		; branch into loop, skipping (-n & 7) groups of 4 insns
+	add		%r0,%r0,%r0		; reset carry (runs in the blr delay slot)
+
+L$loop	ldd		0(%r25),%r20		; load s1 limb
+	ldd		0(%r24),%r31		; load s2 limb
+	add,dc		%r20,%r31,%r20		; sum = s1 + s2 + carry in
+	std		%r20,0(%r26)		; store result limb
+L$7	ldd		8(%r25),%r21		; the remaining groups repeat this pattern at +8 bytes each
+	ldd		8(%r24),%r19
+	add,dc		%r21,%r19,%r21
+	std		%r21,8(%r26)
+L$6	ldd		16(%r25),%r20
+	ldd		16(%r24),%r31
+	add,dc		%r20,%r31,%r20
+	std		%r20,16(%r26)
+L$5	ldd		24(%r25),%r21
+	ldd		24(%r24),%r19
+	add,dc		%r21,%r19,%r21
+	std		%r21,24(%r26)
+L$4	ldd		32(%r25),%r20
+	ldd		32(%r24),%r31
+	add,dc		%r20,%r31,%r20
+	std		%r20,32(%r26)
+L$3	ldd		40(%r25),%r21
+	ldd		40(%r24),%r19
+	add,dc		%r21,%r19,%r21
+	std		%r21,40(%r26)
+L$2	ldd		48(%r25),%r20
+	ldd		48(%r24),%r31
+	add,dc		%r20,%r31,%r20
+	std		%r20,48(%r26)
+L$1	ldd		56(%r25),%r21
+	ldo		64(%r25),%r25		; advance s1_ptr by 8 limbs
+	ldd		56(%r24),%r19
+	add,dc		%r21,%r19,%r21
+	std		%r21,56(%r26)
+	ldo		64(%r24),%r24		; advance s2_ptr by 8 limbs
+	addib,>		-8,%r23,L$loop		; n -= 8, loop while n > 0
+	ldo		64(%r26),%r26		; advance res_ptr by 8 limbs (delay slot)
+
+	add,dc		%r0,%r0,%r29		; r29 = carry out of the last limb
+	bve		(%r2)			; return through r2
+	.exit
+	ldi		0,%r28			; r28 = 0 (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64/addmul_1.S b/rts/gmp/mpn/pa64/addmul_1.S
new file mode 100644
index 0000000000..b1885b432c
--- /dev/null
+++ b/rts/gmp/mpn/pa64/addmul_1.S
@@ -0,0 +1,167 @@
+; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr		%r26
+#define sptr		%r25
+#define size		%r24
+#define s2limb		-56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000.  It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer.
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+	.level  2.0n
+	.code
+	.export __gmpn_addmul_1,entry
+__gmpn_addmul_1
+	.proc
+	.callinfo frame=128,no_calls
+	.entry
+        fldd		-56(%r30),%fr5		; s2limb passed on stack
+	ldo		128(%r30),%r30		; allocate the 128-byte frame
+	add		%r0,%r0,cylimb		; clear cy and cylimb
+
+	std		%r3,-96(%r30)		; save r3-r6 in the frame, reloaded before returning
+	std		%r4,-88(%r30)
+	std		%r5,-80(%r30)
+	std		%r6,-72(%r30)
+	depdi,z		1,31,1,%r5		; r5 = 2^32, added to hi when the middle sum overflows
+
+	fldd		0(sptr),%fr4		; fr4 = first source limb
+	ldo		8(sptr),sptr		; advance sptr
+
+	xmpyu		%fr5R,%fr4R,%fr6	; four 32x32 partial products, passed to the integer side through the frame
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		-128(%r30),lo		; lo = low 64 bit of product
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	ldd		-104(%r30),hi		; hi = high 64 bit of product
+	addib,=		-1,%r24,L$end1		; count -= 1, a single limb only needs the final summation
+	nop
+	fldd		0(sptr),%fr4		; fr4 = second source limb
+	ldo		8(sptr),sptr
+	addib,=		-1,%r24,L$end2		; count -= 1, two limbs skip the loop
+	nop
+L$loop
+	xmpyu		%fr5R,%fr4R,%fr6	; start the partial products of the limb in fr4
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		0(rptr),rlimb		; rlimb = destination limb to accumulate into
+	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
+	add,l		m0,t1,t1		; t1 += m0
+	add,l,*nuv	m1,t1,t1		; t1 += m1, next insn nullified unless this overflows
+	 add,l		%r5,hi,hi		; propagate carry
+	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
+	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
+	add,l		t5,t4,t4		; t4 += t5, giving the low 64 bits of the product
+	ldd		-128(%r30),lo		; lo = low 64 bit of product
+	add		cylimb,rlimb,rlimb	; rlimb += cylimb, sets carry
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	add		t4,rlimb,t3		; t3 = rlimb + t4, sets carry
+	ldd		-104(%r30),hi		; hi = high 64 bit of product
+	add,dc		%r0,cylimb,cylimb	; cylimb += carry
+	fldd		0(sptr),%fr4		; fr4 = next source limb
+	ldo		8(sptr),sptr
+	std		t3,0(rptr)		; store result limb
+	addib,<>	-1,%r24,L$loop		; count -= 1, loop until it reaches zero
+	ldo		8(rptr),rptr		; advance rptr (delay slot)
+L$end2
+	xmpyu		%fr5R,%fr4R,%fr6	; partial products of the last source limb
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		0(rptr),rlimb
+	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
+	add,l		m0,t1,t1		; t1 += m0
+	add,l,*nuv	m1,t1,t1		; t1 += m1, next insn nullified unless this overflows
+	 add,l		%r5,hi,hi		; propagate carry
+	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
+	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
+	add,l		t5,t4,t4		; t4 += t5, giving the low 64 bits of the product
+	ldd		-128(%r30),lo		; lo = low 64 bit of product
+	add		cylimb,rlimb,rlimb	; rlimb += cylimb, sets carry
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	add		t4,rlimb,t3		; t3 = rlimb + t4, sets carry
+	ldd		-104(%r30),hi		; hi = high 64 bit of product
+	add,dc		%r0,cylimb,cylimb	; cylimb += carry
+	std		t3,0(rptr)		; store result limb
+	ldo		8(rptr),rptr
+L$end1
+	ldd		0(rptr),rlimb		; final summation, no further products to start
+	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
+	add,l		m0,t1,t1		; t1 += m0
+	add,l,*nuv	m1,t1,t1		; t1 += m1, next insn nullified unless this overflows
+	 add,l		%r5,hi,hi		; propagate carry
+	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
+	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
+	add,l		t5,t4,t4		; t4 += t5, giving the low 64 bits of the product
+	add		cylimb,rlimb,rlimb	; rlimb += cylimb, sets carry
+	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
+	add		t4,rlimb,t3		; t3 = rlimb + t4, sets carry
+	add,dc		%r0,cylimb,cylimb	; cylimb += carry
+	std		t3,0(rptr)		; store result limb
+	ldo		8(rptr),rptr
+
+	ldd		-96(%r30),%r3		; reload the registers saved on entry
+	ldd		-88(%r30),%r4
+	ldd		-80(%r30),%r5
+	ldd		-72(%r30),%r6
+
+	extrd,u		cylimb,31,32,%r28	; r28 = hi32(cylimb), the full carry limb stays in r29
+	bve		(%r2)			; return through r2
+	.exit
+	ldo		-128(%r30),%r30		; release the frame (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64/gmp-mparam.h b/rts/gmp/mpn/pa64/gmp-mparam.h
new file mode 100644
index 0000000000..847735b987
--- /dev/null
+++ b/rts/gmp/mpn/pa64/gmp-mparam.h
@@ -0,0 +1,65 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values were measured on a PA8000 using the system compiler version
+   A.10.32.30.  Presumably the PA8200 and PA8500 have the same timing
+   characteristics, but GCC might give somewhat different results.  */
+/* Generated by tuneup.c, 2000-07-25.  Each value is guarded by #ifndef, so a definition made before this point takes precedence. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD   16
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD      105
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD   40
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD      116
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD              72
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD             94
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD            50
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD       46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD           1
+#endif
diff --git a/rts/gmp/mpn/pa64/lshift.s b/rts/gmp/mpn/pa64/lshift.s
new file mode 100644
index 0000000000..994bc1c4d6
--- /dev/null
+++ b/rts/gmp/mpn/pa64/lshift.s
@@ -0,0 +1,103 @@
+; HP-PA 2.0 __gmpn_lshift -- Shift a limb vector left by cnt bits and store the result in a second limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; size		gr24
+; cnt		gr23
+
+; This runs at 1.5 cycles/limb on PA8000.  Limbs are processed from the most significant end downwards;
+; the loop is unrolled 8 times and entered through a jump table indexed by the remaining count mod 8.
+	.level	2.0n
+	.code
+	.export	__gmpn_lshift,entry
+__gmpn_lshift
+	.proc
+	.callinfo frame=0,args_saved
+	.entry
+
+	shladd		%r24,3,%r25,%r25	; s1_ptr += 8 * size, just past the top limb
+	shladd		%r24,3,%r26,%r26	; res_ptr += 8 * size
+	subi		64,%r23,%r23		; r23 = 64 - cnt
+	mtsar		%r23			; sar = 64 - cnt, the right-shift amount used by shrpd
+	ldd		-8(%r25),%r21		; r21 = most significant source limb
+	addib,=		-1,%r24,L$end		; size -= 1, a single limb goes straight to the tail
+	shrpd		%r0,%r21,%sar,%r29	; compute carry out limb (delay slot, always executed)
+	depw,z		%r24,31,3,%r28		; r28 = (size & 7)
+	sub		%r0,%r24,%r22		; r22 = -size
+	depw,z		%r22,28,3,%r22		; r22 = 8 * (-size & 7)
+	add		%r25,%r22,%r25		; offset s1_ptr
+	blr		%r28,%r0		; branch into jump table (2 insns per entry)
+	add		%r26,%r22,%r26		; offset res_ptr
+	b		L$0			; entry for r28 = 0
+	nop
+	b		L$1			; entry for r28 = 1
+	copy		%r21,%r20		; odd entry points read the previous limb from r20
+	b		L$2
+	nop
+	b		L$3
+	copy		%r21,%r20
+	b		L$4
+	nop
+	b		L$5
+	copy		%r21,%r20
+	b		L$6
+	nop
+	b		L$7
+	copy		%r21,%r20
+
+L$loop
+L$0	ldd		-16(%r25),%r20		; load the next lower limb
+	shrpd		%r21,%r20,%sar,%r21	; r21 = (r21 << cnt) | (r20 >> (64 - cnt))
+	std		%r21,-8(%r26)		; store result limb
+L$7	ldd		-24(%r25),%r21		; the remaining groups repeat this with r20 and r21 swapping roles
+	shrpd		%r20,%r21,%sar,%r20
+	std		%r20,-16(%r26)
+L$6	ldd		-32(%r25),%r20
+	shrpd		%r21,%r20,%sar,%r21
+	std		%r21,-24(%r26)
+L$5	ldd		-40(%r25),%r21
+	shrpd		%r20,%r21,%sar,%r20
+	std		%r20,-32(%r26)
+L$4	ldd		-48(%r25),%r20
+	shrpd		%r21,%r20,%sar,%r21
+	std		%r21,-40(%r26)
+L$3	ldd		-56(%r25),%r21
+	shrpd		%r20,%r21,%sar,%r20
+	std		%r20,-48(%r26)
+L$2	ldd		-64(%r25),%r20
+	shrpd		%r21,%r20,%sar,%r21
+	std		%r21,-56(%r26)
+L$1	ldd		-72(%r25),%r21
+	ldo		-64(%r25),%r25		; step s1_ptr down by 8 limbs
+	shrpd		%r20,%r21,%sar,%r20
+	std		%r20,-64(%r26)
+	addib,>		-8,%r24,L$loop		; size -= 8, loop while size > 0
+	ldo		-64(%r26),%r26		; step res_ptr down by 8 limbs (delay slot)
+
+L$end	shrpd		%r21,%r0,%sar,%r21	; lowest limb: r21 << cnt with zeros shifted in
+	std		%r21,-8(%r26)		; store the least significant result limb
+	bve		(%r2)			; return through r2
+	.exit
+	extrd,u		%r29,31,32,%r28		; r28 = upper 32 bits of the carry out limb in r29 (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64/mul_1.S b/rts/gmp/mpn/pa64/mul_1.S
new file mode 100644
index 0000000000..ab310c1264
--- /dev/null
+++ b/rts/gmp/mpn/pa64/mul_1.S
@@ -0,0 +1,158 @@
+; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and
+; store the result in a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr		%r26
+#define sptr		%r25
+#define size		%r24
+#define s2limb		-56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000.  It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer.
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+	.level  2.0n
+	.code
+	.export __gmpn_mul_1,entry
+__gmpn_mul_1
+	.proc
+	.callinfo frame=128,no_calls
+	.entry
+        fldd		-56(%r30),%fr5		; s2limb passed on stack
+	ldo		128(%r30),%r30		; allocate the 128-byte frame
+	add		%r0,%r0,cylimb		; clear cy and cylimb
+
+	std		%r3,-96(%r30)		; save r3-r6 in the frame, reloaded before returning
+	std		%r4,-88(%r30)
+	std		%r5,-80(%r30)
+	std		%r6,-72(%r30)
+	depdi,z		1,31,1,%r5		; r5 = 2^32, added to hi when the middle sum overflows
+
+	fldd		0(sptr),%fr4		; fr4 = first source limb
+	ldo		8(sptr),sptr		; advance sptr
+
+	xmpyu		%fr5R,%fr4R,%fr6	; four 32x32 partial products, passed to the integer side through the frame
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		-128(%r30),lo		; lo = low 64 bit of product
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	ldd		-104(%r30),hi		; hi = high 64 bit of product
+	addib,=		-1,%r24,L$end1		; count -= 1, a single limb only needs the final summation
+	nop
+	fldd		0(sptr),%fr4		; fr4 = second source limb
+	ldo		8(sptr),sptr
+	addib,=		-1,%r24,L$end2		; count -= 1, two limbs skip the loop
+	nop
+L$loop
+	xmpyu		%fr5R,%fr4R,%fr6	; start the partial products of the limb in fr4
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
+	add,l		m0,t1,t1		; t1 += m0
+	add,l,*nuv	m1,t1,t1		; t1 += m1, next insn nullified unless this overflows
+	 add,l		%r5,hi,hi		; propagate carry
+	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
+	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
+	add,l		t5,t4,t4		; t4 += t5, giving the low 64 bits of the product
+	ldd		-128(%r30),lo		; lo = low 64 bit of product
+	add		cylimb,t4,t3		; t3 = t4 + cylimb, sets carry
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	ldd		-104(%r30),hi		; hi = high 64 bit of product
+	fldd		0(sptr),%fr4		; fr4 = next source limb
+	ldo		8(sptr),sptr
+	std		t3,0(rptr)		; store result limb
+	addib,<>	-1,%r24,L$loop		; count -= 1, loop until it reaches zero
+	ldo		8(rptr),rptr		; advance rptr (delay slot)
+L$end2
+	xmpyu		%fr5R,%fr4R,%fr6	; partial products of the last source limb
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
+	add,l		m0,t1,t1		; t1 += m0
+	add,l,*nuv	m1,t1,t1		; t1 += m1, next insn nullified unless this overflows
+	 add,l		%r5,hi,hi		; propagate carry
+	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
+	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
+	add,l		t5,t4,t4		; t4 += t5, giving the low 64 bits of the product
+	ldd		-128(%r30),lo		; lo = low 64 bit of product
+	add		cylimb,t4,t3		; t3 = t4 + cylimb, sets carry
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	ldd		-104(%r30),hi		; hi = high 64 bit of product
+	std		t3,0(rptr)		; store result limb
+	ldo		8(rptr),rptr
+L$end1
+	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
+	add,l		m0,t1,t1		; t1 += m0
+	add,l,*nuv	m1,t1,t1		; t1 += m1, next insn nullified unless this overflows
+	 add,l		%r5,hi,hi		; propagate carry
+	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
+	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
+	add,l		t5,t4,t4		; t4 += t5, giving the low 64 bits of the product
+	add		cylimb,t4,t3		; t3 = t4 + cylimb, sets carry
+	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
+	std		t3,0(rptr)		; store result limb
+	ldo		8(rptr),rptr
+
+	ldd		-96(%r30),%r3		; reload the registers saved on entry
+	ldd		-88(%r30),%r4
+	ldd		-80(%r30),%r5
+	ldd		-72(%r30),%r6
+
+	extrd,u		cylimb,31,32,%r28	; r28 = hi32(cylimb), the full carry limb stays in r29
+	bve		(%r2)			; return through r2
+	.exit
+	ldo		-128(%r30),%r30		; release the frame (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64/rshift.s b/rts/gmp/mpn/pa64/rshift.s
new file mode 100644
index 0000000000..f0730e2a91
--- /dev/null
+++ b/rts/gmp/mpn/pa64/rshift.s
@@ -0,0 +1,100 @@
+; HP-PA 2.0 __gmpn_rshift -- Shift a limb vector right by cnt bits and store the result in a second limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; size		gr24
+; cnt		gr23
+
+; This runs at 1.5 cycles/limb on PA8000.  Limbs are processed from the least significant end upwards;
+; the loop is unrolled 8 times and entered through a jump table indexed by the remaining count mod 8.
+	.level	2.0n
+	.code
+	.export	__gmpn_rshift,entry
+__gmpn_rshift
+	.proc
+	.callinfo frame=0,args_saved
+	.entry
+
+	mtsar		%r23			; sar = cnt, the right-shift amount used by shrpd
+	ldd		0(%r25),%r21		; r21 = least significant source limb
+	addib,=		-1,%r24,L$end		; size -= 1, a single limb goes straight to the tail
+	shrpd		%r21,%r0,%sar,%r29	; compute carry out limb (delay slot, always executed)
+	depw,z		%r24,31,3,%r28		; r28 = (size & 7)
+	sub		%r0,%r24,%r22		; r22 = -size
+	depw,z		%r22,28,3,%r22		; r22 = 8 * (-size & 7)
+	sub		%r25,%r22,%r25		; offset s1_ptr
+	blr		%r28,%r0		; branch into jump table (2 insns per entry)
+	sub		%r26,%r22,%r26		; offset res_ptr
+	b		L$0			; entry for r28 = 0
+	nop
+	b		L$1			; entry for r28 = 1
+	copy		%r21,%r20		; odd entry points read the previous limb from r20
+	b		L$2
+	nop
+	b		L$3
+	copy		%r21,%r20
+	b		L$4
+	nop
+	b		L$5
+	copy		%r21,%r20
+	b		L$6
+	nop
+	b		L$7
+	copy		%r21,%r20
+
+L$loop
+L$0	ldd		8(%r25),%r20		; load the next higher limb
+	shrpd		%r20,%r21,%sar,%r21	; r21 = (r20 << (64 - cnt)) | (r21 >> cnt)
+	std		%r21,0(%r26)		; store result limb
+L$7	ldd		16(%r25),%r21		; the remaining groups repeat this with r20 and r21 swapping roles
+	shrpd		%r21,%r20,%sar,%r20
+	std		%r20,8(%r26)
+L$6	ldd		24(%r25),%r20
+	shrpd		%r20,%r21,%sar,%r21
+	std		%r21,16(%r26)
+L$5	ldd		32(%r25),%r21
+	shrpd		%r21,%r20,%sar,%r20
+	std		%r20,24(%r26)
+L$4	ldd		40(%r25),%r20
+	shrpd		%r20,%r21,%sar,%r21
+	std		%r21,32(%r26)
+L$3	ldd		48(%r25),%r21
+	shrpd		%r21,%r20,%sar,%r20
+	std		%r20,40(%r26)
+L$2	ldd		56(%r25),%r20
+	shrpd		%r20,%r21,%sar,%r21
+	std		%r21,48(%r26)
+L$1	ldd		64(%r25),%r21
+	ldo		64(%r25),%r25		; advance s1_ptr by 8 limbs
+	shrpd		%r21,%r20,%sar,%r20
+	std		%r20,56(%r26)
+	addib,>		-8,%r24,L$loop		; size -= 8, loop while size > 0
+	ldo		64(%r26),%r26		; advance res_ptr by 8 limbs (delay slot)
+
+L$end	shrpd		%r0,%r21,%sar,%r21	; highest limb: r21 >> cnt with zeros shifted in
+	std		%r21,0(%r26)		; store the most significant result limb
+	bve		(%r2)			; return through r2
+	.exit
+	extrd,u		%r29,31,32,%r28		; r28 = upper 32 bits of the carry out limb in r29 (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64/sub_n.s b/rts/gmp/mpn/pa64/sub_n.s
new file mode 100644
index 0000000000..dda1f54b34
--- /dev/null
+++ b/rts/gmp/mpn/pa64/sub_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+; and store difference in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; This runs at 2 cycles/limb on PA8000.  The loop body is unrolled 8 times (4 insns per limb);
+; entry branches past (-size & 7) limb groups, so the first pass handles size mod 8 limbs (a full 8 when that is 0).
+	.level	2.0n
+	.code
+	.export	__gmpn_sub_n,entry
+__gmpn_sub_n
+	.proc
+	.callinfo frame=0,args_saved
+	.entry
+
+	sub		%r0,%r23,%r22		; r22 = -n
+	depw,z		%r22,30,3,%r28		; r28 = 2 * (-n & 7)
+	depw,z		%r22,28,3,%r22		; r22 = 8 * (-n & 7)
+	sub		%r25,%r22,%r25		; offset s1_ptr
+	sub		%r24,%r22,%r24		; offset s2_ptr
+	blr		%r28,%r0		; branch into loop, skipping (-n & 7) groups of 4 insns
+	sub		%r26,%r22,%r26		; offset res_ptr and set carry
+
+L$loop	ldd		0(%r25),%r20		; load s1 limb
+	ldd		0(%r24),%r31		; load s2 limb
+	sub,db		%r20,%r31,%r20		; difference = s1 - s2 - borrow in
+	std		%r20,0(%r26)		; store result limb
+L$7	ldd		8(%r25),%r21		; the remaining groups repeat this pattern at +8 bytes each
+	ldd		8(%r24),%r19
+	sub,db		%r21,%r19,%r21
+	std		%r21,8(%r26)
+L$6	ldd		16(%r25),%r20
+	ldd		16(%r24),%r31
+	sub,db		%r20,%r31,%r20
+	std		%r20,16(%r26)
+L$5	ldd		24(%r25),%r21
+	ldd		24(%r24),%r19
+	sub,db		%r21,%r19,%r21
+	std		%r21,24(%r26)
+L$4	ldd		32(%r25),%r20
+	ldd		32(%r24),%r31
+	sub,db		%r20,%r31,%r20
+	std		%r20,32(%r26)
+L$3	ldd		40(%r25),%r21
+	ldd		40(%r24),%r19
+	sub,db		%r21,%r19,%r21
+	std		%r21,40(%r26)
+L$2	ldd		48(%r25),%r20
+	ldd		48(%r24),%r31
+	sub,db		%r20,%r31,%r20
+	std		%r20,48(%r26)
+L$1	ldd		56(%r25),%r21
+	ldo		64(%r25),%r25		; advance s1_ptr by 8 limbs
+	ldd		56(%r24),%r19
+	sub,db		%r21,%r19,%r21
+	std		%r21,56(%r26)
+	ldo		64(%r24),%r24		; advance s2_ptr by 8 limbs
+	addib,>		-8,%r23,L$loop		; n -= 8, loop while n > 0
+	ldo		64(%r26),%r26		; advance res_ptr by 8 limbs (delay slot)
+
+	add,dc		%r0,%r0,%r29		; r29 = final carry bit
+	subi		1,%r29,%r29		; r29 = 1 - carry (carry set means no borrow), i.e. the borrow out
+	bve		(%r2)			; return through r2
+	.exit
+	ldi		0,%r28			; r28 = 0 (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64/submul_1.S b/rts/gmp/mpn/pa64/submul_1.S
new file mode 100644
index 0000000000..27666b99df
--- /dev/null
+++ b/rts/gmp/mpn/pa64/submul_1.S
@@ -0,0 +1,170 @@
+; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr		%r26
+#define sptr		%r25
+#define size		%r24
+#define s2limb		-56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000.  It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer.
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+	.level  2.0n
+	.code
+	.export __gmpn_submul_1,entry
+__gmpn_submul_1
+	.proc
+	.callinfo frame=128,no_calls
+	.entry
+        fldd		-56(%r30),%fr5		; s2limb passed on stack
+	ldo		128(%r30),%r30		; allocate the 128-byte frame
+	add		%r0,%r0,cylimb		; clear cy and cylimb
+
+	std		%r3,-96(%r30)		; save r3-r6 in the frame, reloaded before returning
+	std		%r4,-88(%r30)
+	std		%r5,-80(%r30)
+	std		%r6,-72(%r30)
+	depdi,z		1,31,1,%r5		; r5 = 2^32, added to hi when the middle sum overflows
+
+	fldd		0(sptr),%fr4		; fr4 = first source limb
+	ldo		8(sptr),sptr		; advance sptr
+
+	xmpyu		%fr5R,%fr4R,%fr6	; four 32x32 partial products, passed to the integer side through the frame
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		-128(%r30),lo		; lo = low 64 bit of product
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	ldd		-104(%r30),hi		; hi = high 64 bit of product
+	addib,=		-1,%r24,L$end1		; count -= 1, a single limb only needs the final summation
+	nop
+	fldd		0(sptr),%fr4		; fr4 = second source limb
+	ldo		8(sptr),sptr
+	addib,=		-1,%r24,L$end2		; count -= 1, two limbs skip the loop
+	nop
+L$loop
+	xmpyu		%fr5R,%fr4R,%fr6	; start the partial products of the limb in fr4
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		0(rptr),rlimb		; rlimb = destination limb to subtract from
+	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
+	add,l		m0,t1,t1		; t1 += m0
+	add,l,*nuv	m1,t1,t1		; t1 += m1, next insn nullified unless this overflows
+	 add,l		%r5,hi,hi		; propagate carry
+	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
+	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
+	add,l		t5,t4,t4		; t4 += t5, giving the low 64 bits of the product
+	ldd		-128(%r30),lo		; lo = low 64 bit of product
+	add		cylimb,t4,t4		; t4 += cylimb, sets carry
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	sub		rlimb,t4,t3		; t3 = rlimb - t4
+	add		t4,t3,%r0		; result discarded, carries exactly when rlimb - t4 borrowed
+	ldd		-104(%r30),hi		; hi = high 64 bit of product
+	add,dc		%r0,cylimb,cylimb	; cylimb += that borrow
+	fldd		0(sptr),%fr4		; fr4 = next source limb
+	ldo		8(sptr),sptr
+	std		t3,0(rptr)		; store result limb
+	addib,<>	-1,%r24,L$loop		; count -= 1, loop until it reaches zero
+	ldo		8(rptr),rptr		; advance rptr (delay slot)
+L$end2
+	xmpyu		%fr5R,%fr4R,%fr6	; partial products of the last source limb
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		0(rptr),rlimb
+	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
+	add,l		m0,t1,t1		; t1 += m0
+	add,l,*nuv	m1,t1,t1		; t1 += m1, next insn nullified unless this overflows
+	 add,l		%r5,hi,hi		; propagate carry
+	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
+	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
+	add,l		t5,t4,t4		; t4 += t5, giving the low 64 bits of the product
+	ldd		-128(%r30),lo		; lo = low 64 bit of product
+	add		cylimb,t4,t4		; t4 += cylimb, sets carry
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	sub		rlimb,t4,t3		; t3 = rlimb - t4
+	add		t4,t3,%r0		; result discarded, carries exactly when rlimb - t4 borrowed
+	ldd		-104(%r30),hi		; hi = high 64 bit of product
+	add,dc		%r0,cylimb,cylimb	; cylimb += that borrow
+	std		t3,0(rptr)		; store result limb
+	ldo		8(rptr),rptr
+L$end1
+	ldd		0(rptr),rlimb		; final summation, no further products to start
+	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
+	add,l		m0,t1,t1		; t1 += m0
+	add,l,*nuv	m1,t1,t1		; t1 += m1, next insn nullified unless this overflows
+	 add,l		%r5,hi,hi		; propagate carry
+	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
+	depd,z		t1,31,32,t5		; t5 = lo32(t1) << 32
+	add,l		t5,t4,t4		; t4 += t5, giving the low 64 bits of the product
+	add		cylimb,t4,t4		; t4 += cylimb, sets carry
+	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
+	sub		rlimb,t4,t3		; t3 = rlimb - t4
+	add		t4,t3,%r0		; result discarded, carries exactly when rlimb - t4 borrowed
+	add,dc		%r0,cylimb,cylimb	; cylimb += that borrow
+	std		t3,0(rptr)		; store result limb
+	ldo		8(rptr),rptr
+
+	ldd		-96(%r30),%r3		; reload the registers saved on entry
+	ldd		-88(%r30),%r4
+	ldd		-80(%r30),%r5
+	ldd		-72(%r30),%r6
+
+	extrd,u		cylimb,31,32,%r28	; r28 = hi32(cylimb), the full carry limb stays in r29
+	bve		(%r2)			; return through r2
+	.exit
+	ldo		-128(%r30),%r30		; release the frame (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64/udiv_qrnnd.c b/rts/gmp/mpn/pa64/udiv_qrnnd.c
new file mode 100644
index 0000000000..1c9fe084db
--- /dev/null
+++ b/rts/gmp/mpn/pa64/udiv_qrnnd.c
@@ -0,0 +1,111 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define TWO64 18446744073709551616.0	/* 2^64 as a double constant */
+/* Divide the two-limb value n1,,n0 by d; return the quotient and store the remainder through r.  */
+mp_limb_t
+#if __STDC__
+__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r)
+#else
+__MPN(udiv_qrnnd) (n1, n0, d, r)
+     mp_limb_t n1;	/* high limb of the dividend */
+     mp_limb_t n0;	/* low limb of the dividend */
+     mp_limb_t d;	/* divisor */
+     mp_limb_t *r;	/* receives the remainder */
+#endif
+{
+  mp_limb_t q1, q2, q;	/* first estimate, correction term, final quotient */
+  mp_limb_t p1, p0;	/* high,,low limbs of a multiply-back product */
+  double di, dq;	/* approximate 1/d; approximate quotient */
+  /* NOTE(review): presumably callers guarantee n1 < d so the quotient fits in one limb -- confirm.  */
+  di = 1.0 / d;		/* reciprocal of the divisor, computed once and reused for both estimates */
+
+  /* Generate upper 53 bits of quotient.  Be careful here; the `double'
+     quotient may be rounded to 2^64 which we cannot safely convert back
+     to a 64-bit integer.  */
+  dq = (TWO64 * (double) n1 + (double) n0) * di;	/* double estimate of (n1 * 2^64 + n0) / d */
+  if (dq >= TWO64)
+    q1 = 0xfffffffffffff800LL;	/* 2^64 - 2^11: the upper 53 bits all set */
+  else
+    q1 = (mp_limb_t) dq;
+
+  /* Multiply back in order to compare the product to the dividend.  */
+  umul_ppmm (p1, p0, q1, d);	/* p1,,p0 = q1 * d */
+
+  /* Was the 53-bit quotient greater than our sought quotient?  Test the
+     sign of the partial remainder to find out.  */
+  if (n1 < p1 || (n1 == p1 && n0 < p0))	/* two-limb compare: dividend < q1 * d */
+    {
+      /* 53-bit quotient too large.  Partial remainder is negative.
+	 Compute the absolute value of the remainder in n1,,n0.  */
+      n1 = p1 - (n1 + (p0 < n0));	/* (p0 < n0) is the borrow out of the low limb; must run before n0 is overwritten */
+      n0 = p0 - n0;
+
+      /* Now use the partial remainder as new dividend to compute more bits of
+	 quotient.  This is an adjustment for the one we got previously.  */
+      q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+      umul_ppmm (p1, p0, q2, d);	/* p1,,p0 = q2 * d */
+
+      q = q1 - q2;	/* q1 was too large, so the correction is subtracted */
+      if (n1 < p1 || (n1 == p1 && n0 <= p0))	/* q2 * d >= overshoot: the difference is non-negative */
+	{
+	  n0 = p0 - n0;	/* remainder = q2 * d - overshoot (low limb) */
+	}
+      else
+	{
+	  n0 = p0 - n0;	/* q2 * d < overshoot: the difference is negative ... */
+	  n0 += d;	/* ... so add d and decrement q, keeping q * d + n0 equal to the dividend */
+	  q--;
+	}
+    }
+  else
+    {
+      n1 = n1 - (p1 + (n0 < p0));	/* partial remainder n1,,n0 -= p1,,p0; (n0 < p0) is the low-limb borrow */
+      n0 = n0 - p0;
+
+      q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);	/* estimate the quotient bits q1 is missing */
+      umul_ppmm (p1, p0, q2, d);	/* p1,,p0 = q2 * d */
+
+      q = q1 + q2;	/* q1 was not too large, so the correction is added */
+      if (n1 < p1 || (n1 == p1 && n0 < p0))	/* q2 * d exceeds the partial remainder */
+	{
+	  n0 = n0 - p0;	/* negative difference ... */
+	  n0 += d;	/* ... so add d and decrement q, keeping q * d + n0 equal to the dividend */
+	  q--;
+	}
+      else
+	{
+	  n0 = n0 - p0;	/* non-negative difference (low limb) */
+	  if (n0 >= d)	/* one more multiple of d still fits: final single-step correction */
+	    {
+	      n0 -= d;
+	      q++;
+	    }
+	}
+    }
+
+  *r = n0;	/* the low limb now holds the remainder */
+  return q;
+}
diff --git a/rts/gmp/mpn/pa64/umul_ppmm.S b/rts/gmp/mpn/pa64/umul_ppmm.S
new file mode 100644
index 0000000000..ceff2d752f
--- /dev/null
+++ b/rts/gmp/mpn/pa64/umul_ppmm.S
@@ -0,0 +1,74 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+#define p0	%r28
+#define p1	%r29
+#define t32	%r19
+#define t0	%r20
+#define t1	%r21
+#define x	%r22
+#define m0	%r23
+#define m1	%r24
+	.level  2.0n
+	.code
+	.export __gmpn_umul_ppmm,entry
+__gmpn_umul_ppmm			; unsigned 64 * 64 -> 128 bit multiply: stores the low 64 bits through a pointer, returns the high 64 bits
+	.proc
+	.callinfo frame=128,no_calls
+	.entry
+	ldo		128(%r30),%r30		; allocate the 128-byte frame by adding 128 to %r30
+	depd		%r25,31,32,%r26		; low 32 bits of %r25 become the upper half of %r26: one 64-bit operand
+	std		%r26,-64(%r30)		; spill it so it can be reloaded into an FP register
+	depd		%r23,31,32,%r24		; same for the other operand: %r23 supplies the upper half of %r24
+	std		%r24,-56(%r30)		; spill the second operand
+; Fetch the 32-bit destination pointer for the low product word: -180 from the adjusted %r30 is -52 from its entry value.
+	ldw		-180(%r30),%r31		; NOTE(review): presumably the fifth argument word of the 32-bit calling convention -- confirm
+
+        fldd		-64(%r30),%fr4		; %fr4 = operand built from %r25 and %r26
+        fldd		-56(%r30),%fr5		; %fr5 = operand built from %r23 and %r24
+; Four 32 * 32 -> 64 bit partial products (R = low half, L = high half of an FP register), each stored back to the frame.
+	xmpyu		%fr5R,%fr4R,%fr6	; low * low
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7	; low * high
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8	; high * low
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9	; high * high
+	fstd		%fr9,-104(%r30)
+
+	depdi,z		1,31,1,t32		; t32 = 2^32
+
+	ldd		-128(%r30),p0		; p0 = low 64 bit of product (low * low)
+	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
+	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
+	ldd		-104(%r30),p1		; p1 = high 64 bit of product (high * high)
+; Combine: the two mid products carry weight 2^32, so their sum is split across the low and high product words.
+	add,l,*nuv	m0,m1,x			; x = m1+m0; the next instruction is nullified unless the 64-bit sum overflowed
+	 add,l		t32,p1,p1		; propagate carry to mid of p1 (an overflow of the mid sum is worth 2^32 there)
+	depd,z		x,31,32,t0		; t0 = lo32(m1+m0) moved to the upper half, lower half zeroed
+	add		t0,p0,p0		; finish the low product word; sets the carry for the add,dc below
+	extrd,u		x,31,32,t1		; t1 = hi32(m1+m0)
+	add,dc		t1,p1,p1		; finish the high product word, adding the carry out of the low word
+
+	std		p0,0(%r31)		; store low half of product; %r28 is free for reuse after this
+	extrd,u		p1,31,32,%r28		; %r28 = upper 32 bits of the high product word; %r29 still holds all of p1 (NOTE(review): assumes 64-bit results return in %r28:%r29 -- confirm)
+	bve		(%r2)			; return to the address in %r2
+	.exit
+	ldo		-128(%r30),%r30		; release the frame; executes in the delay slot of the bve
+	.procend
author	Simon Marlow <simonmar@microsoft.com>	2006-04-07 02:05:11 +0000
committer	Simon Marlow <simonmar@microsoft.com>	2006-04-07 02:05:11 +0000
commit	0065d5ab628975892cea1ec7303f968c3338cbe1 (patch)
tree	8e2afe0ab48ee33cf95009809d67c9649573ef92 /rts/gmp/mpn/pa64
parent	28a464a75e14cece5db40f2765a29348273ff2d2 (diff)
download	haskell-0065d5ab628975892cea1ec7303f968c3338cbe1.tar.gz